import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("/Users/prose/OneDrive/Desktop/Data/data.csv")
df.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 158023 | L. Messi | 31 | https://cdn.sofifa.org/players/4/19/158023.png | Argentina | https://cdn.sofifa.org/flags/52.png | 94 | 94 | FC Barcelona | ... | 96.0 | 33.0 | 28.0 | 26.0 | 6.0 | 11.0 | 15.0 | 14.0 | 8.0 | €226.5M |
| 1 | 1 | 20801 | Cristiano Ronaldo | 33 | https://cdn.sofifa.org/players/4/19/20801.png | Portugal | https://cdn.sofifa.org/flags/38.png | 94 | 94 | Juventus | ... | 95.0 | 28.0 | 31.0 | 23.0 | 7.0 | 11.0 | 15.0 | 14.0 | 11.0 | €127.1M |
| 2 | 2 | 190871 | Neymar Jr | 26 | https://cdn.sofifa.org/players/4/19/190871.png | Brazil | https://cdn.sofifa.org/flags/54.png | 92 | 93 | Paris Saint-Germain | ... | 94.0 | 27.0 | 24.0 | 33.0 | 9.0 | 9.0 | 15.0 | 15.0 | 11.0 | €228.1M |
| 3 | 3 | 193080 | De Gea | 27 | https://cdn.sofifa.org/players/4/19/193080.png | Spain | https://cdn.sofifa.org/flags/45.png | 91 | 93 | Manchester United | ... | 68.0 | 15.0 | 21.0 | 13.0 | 90.0 | 85.0 | 87.0 | 88.0 | 94.0 | €138.6M |
| 4 | 4 | 192985 | K. De Bruyne | 27 | https://cdn.sofifa.org/players/4/19/192985.png | Belgium | https://cdn.sofifa.org/flags/7.png | 91 | 92 | Manchester City | ... | 88.0 | 68.0 | 58.0 | 51.0 | 15.0 | 13.0 | 5.0 | 10.0 | 13.0 | €196.4M |
5 rows × 89 columns
#checking the column names
df.columns
Index(['Unnamed: 0', 'ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag',
'Overall', 'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
'Preferred Foot', 'International Reputation', 'Weak Foot',
'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
'Height', 'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing',
'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause'],
dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 18207 entries, 0 to 18206 Data columns (total 89 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 18207 non-null int64 1 ID 18207 non-null int64 2 Name 18207 non-null object 3 Age 18207 non-null int64 4 Photo 18207 non-null object 5 Nationality 18207 non-null object 6 Flag 18207 non-null object 7 Overall 18207 non-null int64 8 Potential 18207 non-null int64 9 Club 17966 non-null object 10 Club Logo 18207 non-null object 11 Value 18207 non-null object 12 Wage 18207 non-null object 13 Special 18207 non-null int64 14 Preferred Foot 18159 non-null object 15 International Reputation 18159 non-null float64 16 Weak Foot 18159 non-null float64 17 Skill Moves 18159 non-null float64 18 Work Rate 18159 non-null object 19 Body Type 18159 non-null object 20 Real Face 18159 non-null object 21 Position 18147 non-null object 22 Jersey Number 18147 non-null float64 23 Joined 16654 non-null object 24 Loaned From 1264 non-null object 25 Contract Valid Until 17918 non-null object 26 Height 18159 non-null object 27 Weight 18159 non-null object 28 LS 16122 non-null object 29 ST 16122 non-null object 30 RS 16122 non-null object 31 LW 16122 non-null object 32 LF 16122 non-null object 33 CF 16122 non-null object 34 RF 16122 non-null object 35 RW 16122 non-null object 36 LAM 16122 non-null object 37 CAM 16122 non-null object 38 RAM 16122 non-null object 39 LM 16122 non-null object 40 LCM 16122 non-null object 41 CM 16122 non-null object 42 RCM 16122 non-null object 43 RM 16122 non-null object 44 LWB 16122 non-null object 45 LDM 16122 non-null object 46 CDM 16122 non-null object 47 RDM 16122 non-null object 48 RWB 16122 non-null object 49 LB 16122 non-null object 50 LCB 16122 non-null object 51 CB 16122 non-null object 52 RCB 16122 non-null object 53 RB 16122 non-null object 54 Crossing 18159 non-null float64 55 Finishing 18159 non-null float64 56 HeadingAccuracy 18159 non-null float64 57 ShortPassing 18159 non-null float64 58 Volleys 18159 non-null float64 59 Dribbling 18159 non-null float64 60 Curve 18159 non-null float64 61 FKAccuracy 18159 non-null float64 62 LongPassing 18159 non-null float64 63 BallControl 18159 non-null float64 64 Acceleration 18159 non-null float64 65 SprintSpeed 18159 non-null float64 66 Agility 18159 non-null float64 67 Reactions 18159 non-null float64 68 Balance 18159 non-null float64 69 ShotPower 18159 non-null float64 70 Jumping 18159 non-null float64 71 Stamina 18159 non-null float64 72 Strength 18159 non-null float64 73 LongShots 18159 non-null float64 74 Aggression 18159 non-null float64 75 Interceptions 18159 non-null float64 76 Positioning 18159 non-null float64 77 Vision 18159 non-null float64 78 Penalties 18159 non-null float64 79 Composure 18159 non-null float64 80 Marking 18159 non-null float64 81 StandingTackle 18159 non-null float64 82 SlidingTackle 18159 non-null float64 83 GKDiving 18159 non-null float64 84 GKHandling 18159 non-null float64 85 GKKicking 18159 non-null float64 86 GKPositioning 18159 non-null float64 87 GKReflexes 18159 non-null float64 88 Release Clause 16643 non-null object dtypes: float64(38), int64(6), object(45) memory usage: 12.4+ MB
df.describe()
| Unnamed: 0 | ID | Age | Overall | Potential | Special | International Reputation | Weak Foot | Skill Moves | Jersey Number | ... | Penalties | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18147.000000 | ... | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 |
| mean | 9103.000000 | 214298.338606 | 25.122206 | 66.238699 | 71.307299 | 1597.809908 | 1.113222 | 2.947299 | 2.361308 | 19.546096 | ... | 48.548598 | 58.648274 | 47.281623 | 47.697836 | 45.661435 | 16.616223 | 16.391596 | 16.232061 | 16.388898 | 16.710887 |
| std | 5256.052511 | 29965.244204 | 4.669943 | 6.908930 | 6.136496 | 272.586016 | 0.394031 | 0.660456 | 0.756164 | 15.947765 | ... | 15.704053 | 11.436133 | 19.904397 | 21.664004 | 21.289135 | 17.695349 | 16.906900 | 16.502864 | 17.034669 | 17.955119 |
| min | 0.000000 | 16.000000 | 16.000000 | 46.000000 | 48.000000 | 731.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 5.000000 | 3.000000 | 3.000000 | 2.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 4551.500000 | 200315.500000 | 21.000000 | 62.000000 | 67.000000 | 1457.000000 | 1.000000 | 3.000000 | 2.000000 | 8.000000 | ... | 39.000000 | 51.000000 | 30.000000 | 27.000000 | 24.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 |
| 50% | 9103.000000 | 221759.000000 | 25.000000 | 66.000000 | 71.000000 | 1635.000000 | 1.000000 | 3.000000 | 2.000000 | 17.000000 | ... | 49.000000 | 60.000000 | 53.000000 | 55.000000 | 52.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 |
| 75% | 13654.500000 | 236529.500000 | 28.000000 | 71.000000 | 75.000000 | 1787.000000 | 1.000000 | 3.000000 | 3.000000 | 26.000000 | ... | 60.000000 | 67.000000 | 64.000000 | 66.000000 | 64.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 |
| max | 18206.000000 | 246620.000000 | 45.000000 | 94.000000 | 95.000000 | 2346.000000 | 5.000000 | 5.000000 | 5.000000 | 99.000000 | ... | 92.000000 | 96.000000 | 94.000000 | 93.000000 | 91.000000 | 90.000000 | 92.000000 | 91.000000 | 90.000000 | 94.000000 |
8 rows × 44 columns
import missingno as msno
msno.matrix(df, labels=True, sort="descending");
msno.bar(df)
<Axes: >
# import the required libraries
import sweetviz as sv
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
print("SweetViz Version : {}".format(sv.__version__))
# analyzing the dataset
report = sv.analyze(df)
# show the report in a form of an HTML file
report.show_html('Report.html')
SweetViz Version : 2.2.1
| | [ 0%] 00:00 -> (? left)
Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
df.shape
(18207, 89)
from dataprep.eda.missing import plot_missing
plot_missing(df)
0%| | 0/774 [00:00<?, ?it/s]
C:\Users\prose\anaconda3\Lib\site-packages\dask\core.py:121: RuntimeWarning: invalid value encountered in divide return func(*(_execute_task(a, cache) for a in args))
| Missing Cells | 76984 |
|---|---|
| Missing Cells (%) | 4.8% |
| Missing Columns | 76 |
| Missing Rows | 18207 |
| Avg Missing Cells per Column | 864.99 |
| Avg Missing Cells per Row | 4.23 |
# ignore warnings :
import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# for visualizations
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
plt.style.use('fivethirtyeight')
# to visualize missing data
import missingno as msno
# check for missing values
df.isnull().sum()
Unnamed: 0 0
ID 0
Name 0
Age 0
Photo 0
...
GKHandling 48
GKKicking 48
GKPositioning 48
GKReflexes 48
Release Clause 1564
Length: 89, dtype: int64
# fill missing values for continuous variables for proper data visualization
df['ShortPassing'].fillna(df['ShortPassing'].mean(), inplace = True)
df['Volleys'].fillna(df['Volleys'].mean(), inplace = True)
df['Dribbling'].fillna(df['Dribbling'].mean(), inplace = True)
df['Curve'].fillna(df['Curve'].mean(), inplace = True)
df['FKAccuracy'].fillna(df['FKAccuracy'], inplace = True)
df['LongPassing'].fillna(df['LongPassing'].mean(), inplace = True)
df['BallControl'].fillna(df['BallControl'].mean(), inplace = True)
df['HeadingAccuracy'].fillna(df['HeadingAccuracy'].mean(), inplace = True)
df['Finishing'].fillna(df['Finishing'].mean(), inplace = True)
df['Crossing'].fillna(df['Crossing'].mean(), inplace = True)
df['Weight'].fillna('200lbs', inplace = True)
df['Contract Valid Until'].fillna(2019, inplace = True)
df['Height'].fillna("5'11", inplace = True)
df['Loaned From'].fillna('None', inplace = True)
df['Joined'].fillna('Jul 1, 2018', inplace = True)
df['Jersey Number'].fillna(8, inplace = True)
df['Body Type'].fillna('Normal', inplace = True)
df['Position'].fillna('ST', inplace = True)
df['Club'].fillna('No Club', inplace = True)
df['Work Rate'].fillna('Medium/ Medium', inplace = True)
df['Skill Moves'].fillna(df['Skill Moves'].median(), inplace = True)
df['Weak Foot'].fillna(3, inplace = True)
df['Preferred Foot'].fillna('Right', inplace = True)
df['International Reputation'].fillna(1, inplace = True)
df['Wage'].fillna('€200K', inplace = True)
df.fillna(0, inplace = True)
df['Preferred Foot'].value_counts()
Right 13996 Left 4211 Name: Preferred Foot, dtype: int64
#Soccer players have a certain active timeframe in their lives. The distribution of age is here
df['Age'].hist()
plt.title("Distribution of age of the players")
Text(0.5, 1.0, 'Distribution of age of the players')
x = df['Age']
plt.figure(figsize = (12, 8))
plt.style.use('ggplot')
ax = sns.distplot(x, bins = 20, kde = True, color='g')
ax.set_xlabel(xlabel = 'Age of the Players', fontsize = 16)
ax.set_title(label = 'Histogram for Age distribution of Players', fontsize = 20)
plt.show()
df['International Reputation'].value_counts()
1.0 16580 2.0 1261 3.0 309 4.0 51 5.0 6 Name: International Reputation, dtype: int64
labels = df['International Reputation'].value_counts().index
size = df['International Reputation'].value_counts()
explode = [0, 0.1, 0.1, 0, 0]
plt.pie(size, labels = labels, explode = explode, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of International Reputation of players', fontsize = 20)
plt.legend()
plt.show()
df['Work Rate'].value_counts()
Medium/ Medium 9858 High/ Medium 3173 Medium/ High 1690 High/ High 1015 Medium/ Low 850 High/ Low 699 Low/ Medium 449 Low/ High 439 Low/ Low 34 Name: Work Rate, dtype: int64
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Work Rate'], data=df, palette = 'PuBuGn_d')
graph.set_title('Work Rate of the Players', fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
height = p.get_height()
graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
labels = df['Work Rate'].value_counts().index
size = df['Work Rate'].value_counts()
explode = [0,0,0.1,0,0.1,0,0,0,0]
plt.pie(size, labels = labels, explode = explode, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of Work Rate of players', fontsize = 20)
plt.legend()
plt.show()
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Work Rate'], data=df, hue='Preferred Foot', palette = 'PuBuGn_d')
graph.set_title('Work Rate of Players segregated by Preferred Foot' , fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
height = p.get_height()
graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
df['Skill Moves'].value_counts()
2.0 8613 3.0 6600 1.0 2026 4.0 917 5.0 51 Name: Skill Moves, dtype: int64
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Skill Moves'], data=df, palette = 'PuBuGn_d')
graph.set_title('Skill Moves of the Players', fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
height = p.get_height()
graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Skill Moves'], data=df, hue='Preferred Foot', palette = 'PuBuGn_d')
graph.set_title('Skill Moves of Players segregated by Preferred Foot' , fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
height = p.get_height()
graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
#Special Score of the Players
x = df['Special']
plt.figure(figsize=(18,10))
ax = sns.distplot(x)
ax.set_xlabel(xlabel = "Player's Special Scores", fontsize = 16)
ax.set_ylabel(ylabel = 'Number of Players', fontsize = 16)
ax.set_title(label = 'Distribution of Players Special Scores', fontsize = 20)
plt.show()
# different positions acquired by the players
df['Position'].value_counts()
ST 2212 GK 2025 CB 1778 CM 1394 LB 1322 RB 1291 RM 1124 LM 1095 CAM 958 CDM 948 RCB 662 LCB 648 LCM 395 RCM 391 LW 381 RW 370 RDM 248 LDM 243 LS 207 RS 203 RWB 87 LWB 78 CF 74 LAM 21 RAM 21 RF 16 LF 15 Name: Position, dtype: int64
df['Nationality'].nunique()
164
top_countries = df['Nationality'].value_counts().head(10)
top_countries
England 1662 Germany 1198 Spain 1072 Argentina 937 France 914 Brazil 827 Italy 702 Colombia 618 Japan 478 Netherlands 453 Name: Nationality, dtype: int64
#A word cloud of nationalities will help understand which nationalities are dominating.
#For doing that we need to join all the nationalities and then make a word cloud.
nationality = " ".join(n for n in df['Nationality'])
from wordcloud import WordCloud
plt.figure(figsize=(10, 10))
wc = WordCloud().generate(nationality)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
fig, ax = plt.subplots(figsize=(12,8))
x = top_countries.values
y = top_countries.index
ax.barh(y, x, align='center', color='green')
ax.invert_yaxis() # labels read top-to-bottom
ax.set_xlabel('Number of Players')
ax.set_ylabel('Name of Countries', rotation=0)
ax.set_title('Top 10 Countries with most number of players')
plt.show()
top_countries_name = top_countries.index
top_countries_name
Index(['England', 'Germany', 'Spain', 'Argentina', 'France', 'Brazil', 'Italy',
'Colombia', 'Japan', 'Netherlands'],
dtype='object')
#Age distribution from top countries
df_country_age = df.loc[df['Nationality'].isin(top_countries_name) & df['Age']]
plt.figure(1 , figsize = (12,6))
sns.boxplot(x = 'Nationality' , y = 'Age' , data = df_country_age, palette='rocket')
plt.title('Age Distribution in top countries')
plt.xticks(rotation = 50)
plt.show()
#Overall Rating
df_country_rating = df.loc[df['Nationality'].isin(top_countries_name) & df['Overall']]
plt.figure(1 , figsize = (12,6))
sns.barplot(x = 'Nationality' , y = 'Overall' , data = df_country_rating, palette='spring')
plt.title('Overall Rating Distribution of Players in top countries')
plt.xticks(rotation = 50)
plt.show()
#Potential Rating
df_country_rating = df.loc[df['Nationality'].isin(top_countries_name) & df['Potential']]
plt.figure(1 , figsize = (12,6))
sns.barplot(x = 'Nationality' , y = 'Potential' , data = df_country_rating, palette='PuBuGn_d')
plt.title('Potential Rating Distribution of Players in top countries')
plt.xticks(rotation = 50)
plt.show()
#Analyse players based on club
df['Club'].nunique()
652
df['Club'].unique()
array(['FC Barcelona', 'Juventus', 'Paris Saint-Germain',
'Manchester United', 'Manchester City', 'Chelsea', 'Real Madrid',
'Atlético Madrid', 'FC Bayern München', 'Tottenham Hotspur',
'Liverpool', 'Napoli', 'Arsenal', 'Milan', 'Inter', 'Lazio',
'Borussia Dortmund', 'Vissel Kobe', 'Olympique Lyonnais', 'Roma',
'Valencia CF', 'Guangzhou Evergrande Taobao FC', 'FC Porto',
'FC Schalke 04', 'Beşiktaş JK', 'LA Galaxy', 'Sporting CP',
'Real Betis', 'Olympique de Marseille', 'RC Celta',
'Bayer 04 Leverkusen', 'Real Sociedad', 'Villarreal CF',
'Sevilla FC', 'SL Benfica', 'AS Saint-Étienne', 'AS Monaco',
'Leicester City', 'Atalanta', 'Grêmio', 'Atlético Mineiro',
'RB Leipzig', 'Ajax', 'Dalian YiFang FC', 'Everton',
'West Ham United', '1. FC Köln', 'TSG 1899 Hoffenheim',
'Shanghai SIPG FC', 'OGC Nice', 'Al Nassr',
'Wolverhampton Wanderers', 'Borussia Mönchengladbach',
'Hertha BSC', 'SV Werder Bremen', 'Cruzeiro',
'Athletic Club de Bilbao', 'Torino', 'Medipol Başakşehir FK',
'Beijing Sinobo Guoan FC', 'Crystal Palace', 'PFC CSKA Moscow',
'VfL Wolfsburg', 'Shakhtar Donetsk', 'Toronto FC',
'Lokomotiv Moscow', 'Sassuolo', 'New York City FC', 'Fluminense',
'PSV', 'Levante UD', 'Fulham', 'Watford', 'Atlanta United',
'Montpellier HSC', 'Galatasaray SK', 'Fenerbahçe SK', 'SD Eibar',
'Los Angeles FC', 'Sampdoria', 'Al Hilal', 'VfB Stuttgart',
'SC Braga', 'River Plate', 'Deportivo Alavés', 'No Club',
'Eintracht Frankfurt', 'Girona FC', 'Guangzhou R&F; FC', 'Burnley',
'Stoke City', 'Southampton', 'Tianjin Quanjian FC', 'Getafe CF',
'Beijing Renhe FC', 'Montreal Impact', 'Chievo Verona', 'Genoa',
'Portland Timbers', 'Tigres U.A.N.L.', 'RCD Espanyol',
'Hebei China Fortune FC', 'Cagliari', 'Chicago Fire', 'DC United',
'Sagan Tosu', 'Dynamo Kyiv', 'Santos', 'Internacional',
'América FC (Minas Gerais)', 'Independiente', 'Boca Juniors',
'Cruz Azul', '1. FSV Mainz 05', 'Bournemouth', 'Spartak Moscow',
'Racing Club', 'FC Augsburg', 'Fiorentina', 'FC Nantes',
'Feyenoord', 'Club Brugge KV', 'Brighton & Hove Albion', 'Al Ahli',
'Jiangsu Suning FC', 'SC Freiburg', 'PAOK', 'Stade Rennais FC',
'Trabzonspor', 'SPAL', 'Portimonense SC', 'Olympiacos CFP',
'Club Atlético Huracán', 'Kasimpaşa SK', 'Newcastle United',
'Frosinone', 'Querétaro', 'KRC Genk', 'Hannover 96',
'Stade Malherbe Caen', 'Godoy Cruz', 'Toulouse Football Club',
'RSC Anderlecht', 'Huddersfield Town', 'CD Tondela',
'Seattle Sounders FC', 'Hamburger SV', 'FC Red Bull Salzburg',
'Rio Ave FC', 'FC Girondins de Bordeaux', 'Melbourne Victory',
'Parma', 'FC Basel 1893', 'Al Wehda', 'BSC Young Boys', 'KAA Gent',
'Al Ittihad', 'Standard de Liège', 'Shanghai Greenland Shenhua FC',
'Colo-Colo', 'Junior FC', 'West Bromwich Albion',
'RC Strasbourg Alsace', 'Göztepe SK', 'Deportivo Cali',
'Deportivo Toluca', 'Bologna', 'Nagoya Grampus', 'Amiens SC',
'Changchun Yatai FC', 'Club Atlético Lanús', 'Botafogo',
'Club América', 'Udinese', 'Real Valladolid CF', 'CD Leganés',
'Club Atlético Banfield', 'Celtic', 'Vitória Guimarães',
'FC København', 'UD Las Palmas', 'Deportivo de La Coruña',
'Universidad Católica', 'San Lorenzo de Almagro', 'Rayo Vallecano',
'Monterrey', 'Columbus Crew SC', 'MKE Ankaragücü',
'Guizhou Hengfeng FC', 'Swansea City', 'Tianjin TEDA FC',
'Chongqing Dangdai Lifan FC SWM Team', 'AEK Athens', 'Al Taawoun',
'Melbourne City FC', 'En Avant de Guingamp',
'Akhisar Belediyespor', 'Foggia', 'LOSC Lille', '1. FC Nürnberg',
'Clube Sport Marítimo', 'Real Sporting de Gijón', 'BB Erzurumspor',
'Shandong Luneng TaiShan FC', 'Club Atlético Colón', 'Bahia',
'Once Caldas', 'FC Groningen', 'Angers SCO', 'Paraná',
'Antalyaspor', 'Minnesota United FC', 'Club León', 'Empoli',
'VVV-Venlo', 'Leeds United', 'Viktoria Plzeň', 'Alanyaspor',
'Atlético Paranaense', 'Derby County', 'Kawasaki Frontale',
'Cardiff City', 'Aston Villa', 'Guadalajara', 'Dijon FCO',
'Santos Laguna', 'Málaga CF', 'Vitória', 'Çaykur Rizespor',
'U.N.A.M.', 'Nottingham Forest', 'Royal Antwerp FC',
'Club Tijuana', 'Sport Club do Recife', 'Real Salt Lake',
'AZ Alkmaar', 'SK Slavia Praha', 'Willem II', 'Middlesbrough',
'Dinamo Zagreb', 'Club Atlas', 'Granada CF', 'Sydney FC',
'Sporting Kansas City', 'SV Zulte-Waregem', 'Philadelphia Union',
'Real Oviedo', 'Pachuca', 'Boavista FC', 'Atiker Konyaspor',
'Kaizer Chiefs', 'GD Chaves', 'Palermo', 'Atlético Nacional',
'Puebla FC', 'Perth Glory', 'Panathinaikos FC', 'FC Sion',
'Vitória de Setúbal', 'New York Red Bulls', 'Al Shabab',
'Monarcas Morelia', 'Albacete BP', 'Rangers FC', 'Sparta Praha',
'Legia Warszawa', 'Urawa Red Diamonds', 'Rosario Central',
'Stade de Reims', 'ADO Den Haag', 'Chapecoense', 'FC Midtjylland',
'San Jose Earthquakes', 'Belgrano de Córdoba', 'Brescia',
'Kashima Antlers', 'CD Everton de Viña del Mar',
'Fortuna Düsseldorf', 'SD Huesca', 'Preston North End',
'Club Atlético Talleres', 'Benevento', 'Vitesse',
'Gimnasia y Esgrima La Plata', 'Houston Dynamo', 'Club Necaxa',
'Norwich City', 'Holstein Kiel', 'Ettifaq FC', 'Kayserispor',
'1. FC Heidenheim 1846', 'Brentford', 'Yeni Malatyaspor',
'Lobos BUAP', 'Bursaspor', 'Ceará Sporting Club',
'Sheffield United', 'FC Ingolstadt 04', 'Estudiantes de La Plata',
'AIK', 'Queens Park Rangers', 'Suwon Samsung Bluewings',
'Heart of Midlothian', 'Reading', 'FC Dallas', 'Heracles Almelo',
'Venezia FC', 'CD Lugo', 'Henan Jianye FC', 'Orlando City SC',
'CA Osasuna', 'NAC Breda', 'Livorno', 'Universidad de Chile',
'Brøndby IF', 'Aberdeen', 'Defensa y Justicia', 'Atlético Tucumán',
'Blackburn Rovers', 'SV Darmstadt 98', 'Moreirense FC',
'Sanfrecce Hiroshima', 'CD Numancia', 'KV Oostende', 'FC Utrecht',
'Vancouver Whitecaps FC', 'Odense Boldklub', 'SC Heerenveen',
'Racing Club de Lens', 'Independiente Santa Fe',
'Sporting de Charleroi', 'Millonarios FC', 'Sheffield Wednesday',
'Perugia', 'Daegu FC', 'Vélez Sarsfield',
'Grasshopper Club Zürich', 'Sivasspor', 'Nîmes Olympique',
'Rosenborg BK', 'SK Sturm Graz', 'FC Metz',
'CD Universidad de Concepción', 'Hellas Verona', 'Brisbane Roar',
'CD Feirense', 'Hull City', 'Waasland-Beveren', 'Neuchâtel Xamax',
'Real Zaragoza', 'CD Aves', 'Millwall', 'Unión de Santa Fe',
'KAS Eupen', 'Cádiz CF', 'FC Tokyo', 'CD Tenerife',
'1. FC Union Berlin', 'Al Fayha', 'AJ Auxerre',
'Patriotas Boyacá FC', 'Molde FK', 'Bristol City', 'CD Nacional',
'Sporting Lokeren', 'FC St. Pauli', 'Deportes Iquique',
'Al Qadisiyah', 'Atlético Bucaramanga', 'Club Atlético Tigre',
'FK Austria Wien', 'Patronato', 'Malmö FF', 'Kashiwa Reysol',
'US Cremonese', 'VfL Bochum 1848', 'SK Rapid Wien',
'KSV Cercle Brugge', 'Rionegro Águilas', 'Gimnàstic de Tarragona',
'Lecce', 'Santa Clara', 'BK Häcken', 'New England Revolution',
'Orlando Pirates', 'Atlético Huila', 'Western Sydney Wanderers',
'Kalmar FF', 'Independiente Medellín', 'Fortuna Sittard',
'Lech Poznań', 'Djurgårdens IF', 'CF Reus Deportiu', 'SK Brann',
'Ulsan Hyundai FC', 'Sint-Truidense VV', 'Carpi', 'Al Fateh',
'Royal Excel Mouscron', 'AC Ajaccio', 'PEC Zwolle', 'Sunderland',
'Club Atlético Aldosivi', 'US Salernitana 1919', 'FC Lorient',
'Argentinos Juniors', 'AD Alcorcón', 'Crotone', 'Excelsior',
'KV Kortrijk', 'IFK Norrköping', 'Adelaide United',
'FC St. Gallen', 'Tiburones Rojos de Veracruz', 'CD Palestino',
'Jeju United FC', 'Deportes Tolima', 'Jeonbuk Hyundai Motors',
'Birmingham City', 'América de Cali', 'La Equidad', 'Spezia',
'Aalborg BK', 'Le Havre AC', 'Górnik Zabrze',
'Central Coast Mariners', 'Wigan Athletic',
'Jagiellonia Białystok', 'Cittadella', 'Hibernian', 'FC Lugano',
'San Martín de San Juan', 'Strømsgodset IF', 'Júbilo Iwata',
"Newell's Old Boys", 'Al Faisaly', 'Colorado Rapids',
'IF Elfsborg', 'SV Sandhausen', 'Al Batin', 'Stade Brestois 29',
'UD Almería', 'Gyeongnam FC', 'Yokohama F. Marinos', 'Kilmarnock',
'Pescara', 'Newcastle Jets', 'Córdoba CF', 'RCD Mallorca',
'Hammarby IF', 'Cerezo Osaka', 'KFC Uerdingen 05',
'Shimizu S-Pulse', 'MSV Duisburg', 'Os Belenenses',
'DSC Arminia Bielefeld', 'Ipswich Town', 'FC Seoul',
'Lechia Gdańsk', 'Gamba Osaka', 'CF Rayo Majadahonda', 'LASK Linz',
'Bolton Wanderers', 'Al Raed', 'Extremadura UD', 'SC Paderborn 07',
'Wellington Phoenix', 'Unión Española', 'Alianza Petrolera',
'Cracovia', 'Gangwon FC', 'Elche CF', 'ESTAC Troyes', 'AS Béziers',
'La Berrichonne de Châteauroux', 'Clermont Foot 63',
'1. FC Magdeburg', 'Pohang Steelers', 'Örebro SK', 'Arka Gdynia',
'SG Dynamo Dresden', 'SpVgg Greuther Fürth', 'CD Huachipato',
'Wisła Kraków', 'Stabæk Fotball', 'Eintracht Braunschweig',
'Valenciennes FC', 'FC Thun', 'San Luis de Quillota',
' SSV Jahn Regensburg', 'Cosenza', 'FC Nordsjælland',
'FC Erzgebirge Aue', 'Jeonnam Dragons', 'Wolfsberger AC',
'Chamois Niortais Football Club', 'Club Deportes Temuco',
'AS Nancy Lorraine', 'Red Star FC', 'Al Hazem', 'Pogoń Szczecin',
'Charlton Athletic', 'Grenoble Foot 38', 'FC Hansa Rostock',
'San Martin de Tucumán', 'Incheon United FC', 'Śląsk Wrocław',
'GFC Ajaccio', '1. FC Kaiserslautern', 'Deportivo Pasto',
'Lincoln City', 'Motherwell', 'Rotherham United', 'Burton Albion',
'Wisła Płock', 'FC Wacker Innsbruck', 'Peterborough United',
'Ascoli', 'FC Zürich', 'Fleetwood Town', 'Padova',
'FC Sochaux-Montbéliard', 'SV Wehen Wiesbaden', 'Unión La Calera',
'Scunthorpe United', "CD O'Higgins", 'CD Antofagasta',
'Plymouth Argyle', 'Aarhus GF', 'Lillestrøm SK', 'Karlsruher SC',
'GIF Sundsvall', 'FC Emmen', 'Barnsley', 'Audax Italiano',
'V-Varen Nagasaki', 'Paris FC', 'SpVgg Unterhaching', 'Hobro IK',
'De Graafschap', 'Hokkaido Consadole Sapporo', 'Tromsø IL',
'FC Luzern', 'FK Haugesund', 'Zagłębie Lubin', 'VfR Aalen',
'Dundalk', 'Oxford United', 'Piast Gliwice', 'Ohod Club',
'Östersunds FK', 'Vegalta Sendai', 'Crawley Town',
'FC Admira Wacker Mödling', 'Vålerenga Fotball', 'Dundee FC',
'Portsmouth', 'Envigado FC', 'Miedź Legnica', 'Odds BK',
'SC Fortuna Köln', 'US Orléans Loiret Football', 'Sarpsborg 08 FF',
'Jaguares de Córdoba', 'Bradford City', 'Accrington Stanley',
'St. Johnstone FC', 'Boyacá Chicó FC', 'Luton Town',
'SV Mattersburg', 'Kristiansund BK', 'Sangju Sangmu FC',
'Rochdale', 'Walsall', 'Korona Kielce', 'Shonan Bellmare',
'FC Würzburger Kickers', 'FSV Zwickau', 'St. Mirren', 'AC Horsens',
'Esbjerg fB', 'HJK Helsinki', 'Southend United', 'Bristol Rovers',
'Hamilton Academical FC', 'TSV 1860 München', 'Curicó Unido',
'SCR Altach', 'Ranheim Fotball', 'Stevenage',
'SG Sonnenhof Großaspach', 'Oldham Athletic', 'Milton Keynes Dons',
'FK Bodø/Glimt', 'SC Preußen Münster', 'Wycombe Wanderers',
'Vejle Boldklub', 'Bury', 'Randers FC', 'VfL Osnabrück',
'SønderjyskE', 'IFK Göteborg', 'Mansfield Town', 'Coventry City',
'Waterford FC', 'Shrewsbury', 'IK Start', 'Gillingham',
'FC Energie Cottbus', 'FC Carl Zeiss Jena', 'Hallescher FC',
'SV Meppen', 'AFC Wimbledon', 'Blackpool', 'Doncaster Rovers',
'Sandefjord Fotball', 'VfL Sportfreunde Lotte', 'Cheltenham Town',
'IK Sirius', 'Vendsyssel FF', 'Swindon Town', 'Notts County',
'SKN St. Pölten', 'Exeter City', 'Northampton Town',
'Shamrock Rovers', 'Colchester United', 'Livingston FC',
'TSV Hartberg', 'Tranmere Rovers', 'Cambridge United',
'Grimsby Town', 'Port Vale', 'Itagüí Leones FC',
'Forest Green Rovers', 'Dalkurd FF', 'Zagłębie Sosnowiec',
'Carlisle United', 'Trelleborgs FF', "St. Patrick's Athletic",
'Morecambe', 'Cork City', 'IF Brommapojkarna', 'Crewe Alexandra',
'Yeovil Town', 'Bohemian FC', 'Macclesfield Town',
'Newport County', 'Sligo Rovers', 'Derry City', 'Limerick FC',
'Bray Wanderers'], dtype=object)
clubs = ['FC Barcelona','Real Madrid','Juventus','Liverpool','Manchester United',
'Chelsea','Arsenal','Paris Saint-Germain' ,'FC Bayern München','Manchester City']
#Age distribution in famous clubs
df_club_age = df.loc[df['Club'].isin(clubs) & df['Age']]
plt.figure(1 , figsize = (12,6))
sns.boxplot(x = 'Club', y = 'Age' , data = df_club_age, palette='spring')
plt.title('Age Distribution in famous clubs')
plt.xticks(rotation = 50)
plt.show()
#Overall Rating in famous clubs
df_club_rating = df.loc[df['Club'].isin(clubs) & df['Overall']]
plt.figure(1 , figsize = (12,6))
sns.boxplot(x = 'Club' , y = 'Overall' , data = df_club_rating, palette='PuBuGn_d')
plt.title('Overall Rating Distribution in famous clubs')
plt.xticks(rotation = 50)
plt.show()
#The Best Clubs with Players Overall Rating
best_dict = {}
for club in df['Club'].unique():
overall_rating = df['Overall'][df['Club'] == club].sum()
best_dict[club] = overall_rating
best_club = pd.DataFrame.from_dict(best_dict, orient='index', columns = ['overall'])
best_club['club'] = best_club.index
best_club = best_club.sort_values(by = 'overall', ascending = False)
plt.figure(1 , figsize = (15 , 6))
sns.barplot(x ='club',y ='overall',data = best_club.head(10),palette='Reds')
plt.xticks(rotation = 70)
plt.xlabel("Club")
plt.ylabel('Sum of Overall Rating of players in club')
plt.title('Clubs with best Players (sum of overall ratings of players per club)')
plt.ylim(2450 , 2600)
plt.show()
#Profiling top players
#The Best Players
df_best_players = pd.DataFrame.copy(df.sort_values(by ='Overall',ascending = False ).head(10))
plt.figure(1,figsize = (12,6))
sns.barplot(x ='Name' , y = 'Overall' , data = df_best_players, palette='PuBuGn_d')
plt.ylim(85,95)
plt.show()
#The Highest Earners
def normalizing_wage(x):
if '€' in str(x) and 'M' in str(x):
c = str(x).replace('€' , '')
c = str(c).replace('M' , '')
c = float(c) * 1000000
else:
c = str(x).replace('€' , '')
c = str(c).replace('K' , '')
c = float(c) * 1000
return c
df['Normalized_Wage'] = df['Wage'].apply(lambda x : normalizing_wage(x))
df.sort_values(by = 'Normalized_Wage' , ascending = False)[['Name','Club','Nationality','Overall',
'Age','Normalized_Wage','Wage]].head(5)
| Name | Club | Nationality | Overall | Age | Normalized_Wage | Wage | |
|---|---|---|---|---|---|---|---|
| 0 | L. Messi | FC Barcelona | Argentina | 94 | 31 | 565000.0 | €565 |
| 7 | L. Suárez | FC Barcelona | Uruguay | 91 | 31 | 455000.0 | €455 |
| 6 | L. Modrić | Real Madrid | Croatia | 91 | 32 | 420000.0 | €420 |
| 1 | Cristiano Ronaldo | Juventus | Portugal | 94 | 33 | 405000.0 | €405 |
| 8 | Sergio Ramos | Real Madrid | Spain | 91 | 32 | 380000.0 | €380 |
#The Eldest Players
df.sort_values(by = 'Age' , ascending = False)[['Name','Club','Nationality','Overall', 'Age' ]].head()
| Name | Club | Nationality | Overall | Age | |
|---|---|---|---|---|---|
| 4741 | O. Pérez | Pachuca | Mexico | 71 | 45 |
| 18183 | K. Pilkington | Cambridge United | England | 48 | 44 |
| 17726 | T. Warner | Accrington Stanley | Trinidad & Tobago | 53 | 44 |
| 10545 | S. Narazaki | Nagoya Grampus | Japan | 65 | 42 |
| 7225 | C. Muñoz | CD Universidad de Concepción | Argentina | 68 | 41 |
#The Youngest Players
df.sort_values(by = 'Age' , ascending = True)[['Name','Club','Nationality','Overall', 'Age' ]].head()
| Name | Club | Nationality | Overall | Age | |
|---|---|---|---|---|---|
| 18206 | G. Nugent | Tranmere Rovers | England | 46 | 16 |
| 17743 | J. Olstad | Sarpsborg 08 FF | Norway | 52 | 16 |
| 13293 | H. Massengo | AS Monaco | France | 62 | 16 |
| 16081 | J. Italiano | Perth Glory | Australia | 58 | 16 |
| 18166 | N. Ayéva | Örebro SK | Sweden | 48 | 16 |
#The Best Freekick Takers
df.sort_values(by = 'FKAccuracy' , ascending = False)[['Name','Club','Nationality','Age','FKAccuracy']].head()
| Name | Club | Nationality | Age | FKAccuracy | |
|---|---|---|---|---|---|
| 0 | L. Messi | FC Barcelona | Argentina | 31 | 94.0 |
| 293 | S. Giovinco | Toronto FC | Italy | 31 | 93.0 |
| 72 | M. Pjanić | Juventus | Bosnia Herzegovina | 28 | 92.0 |
| 1113 | E. Bardhi | Levante UD | FYR Macedonia | 22 | 91.0 |
| 90 | Parejo | Valencia CF | Spain | 29 | 90.0 |
#The Best Penalty Kick Taker
df.sort_values(by = 'Penalties' , ascending = False)[['Name','Club','Nationality','Age','Penalties']].head()
| Name | Club | Nationality | Age | Penalties | |
|---|---|---|---|---|---|
| 206 | M. Balotelli | OGC Nice | Italy | 27 | 92.0 |
| 118 | Fabinho | Liverpool | Brazil | 24 | 91.0 |
| 16 | H. Kane | Tottenham Hotspur | England | 24 | 90.0 |
| 823 | R. Jiménez | Wolverhampton Wanderers | Mexico | 27 | 90.0 |
| 945 | L. Baines | Everton | England | 33 | 90.0 |
#Best players with the Ball Control
df.sort_values(by = 'BallControl' , ascending = False)[['Name','Club','Nationality','Overall', 'Age','BallControl']].head()
| Name | Club | Nationality | Overall | Age | BallControl | |
|---|---|---|---|---|---|---|
| 0 | L. Messi | FC Barcelona | Argentina | 94 | 31 | 96.0 |
| 2 | Neymar Jr | Paris Saint-Germain | Brazil | 92 | 26 | 95.0 |
| 30 | Isco | Real Madrid | Spain | 88 | 26 | 95.0 |
| 13 | David Silva | Manchester City | Spain | 90 | 32 | 94.0 |
| 5 | E. Hazard | Chelsea | Belgium | 91 | 27 | 94.0 |
#Fastest Players
df.sort_values(by = 'SprintSpeed' , ascending False)[['Name','Club','Nationality','Overall', 'Age','SprintSpeed']].head()
| Name | Club | Nationality | Overall | Age | SprintSpeed | |
|---|---|---|---|---|---|---|
| 55 | L. Sané | Manchester City | Germany | 86 | 22 | 96.0 |
| 25 | K. Mbappé | Paris Saint-Germain | France | 88 | 19 | 96.0 |
| 1968 | Adama | Wolverhampton Wanderers | Spain | 75 | 22 | 96.0 |
| 36 | G. Bale | Real Madrid | Wales | 88 | 28 | 95.0 |
| 10928 | Maicon | Livorno | Brazil | 65 | 25 | 95.0 |
#The Best Dribbler
df.sort_values(by = 'Dribbling' , ascending = False)[['Name','Club','Nationality','Overall', 'Age','Dribbling']].head()
| Name | Club | Nationality | Overall | Age | Dribbling | |
|---|---|---|---|---|---|---|
| 0 | L. Messi | FC Barcelona | Argentina | 94 | 31 | 97.0 |
| 2 | Neymar Jr | Paris Saint-Germain | Brazil | 92 | 26 | 96.0 |
| 5 | E. Hazard | Chelsea | Belgium | 91 | 27 | 95.0 |
| 30 | Isco | Real Madrid | Spain | 88 | 26 | 94.0 |
| 94 | Y. Brahimi | FC Porto | Algeria | 85 | 28 | 93.0 |
#The Best Finisher
df.sort_values(by = 'Finishing' , ascending = False)[['Name','Club','Nationality','Overall', 'Age','Finishing']].head()
| Name | Club | Nationality | Overall | Age | Finishing | |
|---|---|---|---|---|---|---|
| 0 | L. Messi | FC Barcelona | Argentina | 94 | 31 | 95.0 |
| 16 | H. Kane | Tottenham Hotspur | England | 89 | 24 | 94.0 |
| 1 | Cristiano Ronaldo | Juventus | Portugal | 94 | 33 | 94.0 |
| 7 | L. Suárez | FC Barcelona | Uruguay | 91 | 31 | 93.0 |
| 23 | S. Agüero | Manchester City | Argentina | 89 | 30 | 93.0 |
#Distribution of weak foot
df['Weak Foot'].value_counts()
3.0 11397 2.0 3761 4.0 2662 5.0 229 1.0 158 Name: Weak Foot, dtype: int64
labels = df['Weak Foot'].value_counts().index
size = df['Weak Foot'].value_counts()
colors=['cyan','pink','orange','lightgreen','yellow']
explode = [0, 0.1, 0.1, 0, 0]
plt.pie(size, labels = labels, colors = colors, explode = explode, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of Weak Foot among players', fontsize = 20)
plt.legend()
plt.show()
df['Preferred Foot'].value_counts()/len(df)
Right 0.768715 Left 0.231285 Name: Preferred Foot, dtype: float64
labels = df['Preferred Foot'].value_counts().index
size = df['Preferred Foot'].value_counts()
colors=['cyan','pink']
plt.pie(size, labels = labels, colors = colors, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of Preferred Foot among players', fontsize = 20)
plt.legend()
plt.show()
#Segregation of Indian Players
def country(x):
return df[df['Nationality'] == x].head()
# prepare dataset for Indian players
country('India')
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8605 | 8605 | 190939 | S. Chhetri | 33 | https://cdn.sofifa.org/players/4/19/190939.png | India | https://cdn.sofifa.org/flags/159.png | 67 | 67 | No Club | ... | 30.0 | 24.0 | 44.0 | 10.0 | 7.0 | 11.0 | 9.0 | 10.0 | 0 | 0.0 |
| 10011 | 10011 | 223763 | S. Jhingan | 24 | https://cdn.sofifa.org/players/4/19/223763.png | India | https://cdn.sofifa.org/flags/159.png | 65 | 71 | No Club | ... | 64.0 | 61.0 | 60.0 | 13.0 | 11.0 | 7.0 | 11.0 | 12.0 | 0 | 0.0 |
| 12598 | 12598 | 217225 | J. Lalpekhlua | 27 | https://cdn.sofifa.org/players/4/19/217225.png | India | https://cdn.sofifa.org/flags/159.png | 63 | 64 | No Club | ... | 28.0 | 31.0 | 29.0 | 13.0 | 11.0 | 10.0 | 10.0 | 11.0 | 0 | 0.0 |
| 12811 | 12811 | 225213 | G. Singh Sandhu | 26 | https://cdn.sofifa.org/players/4/19/225213.png | India | https://cdn.sofifa.org/flags/159.png | 63 | 68 | No Club | ... | 19.0 | 15.0 | 11.0 | 63.0 | 59.0 | 59.0 | 62.0 | 64.0 | 0 | 0.0 |
| 13508 | 13508 | 238205 | A. Edathodika | 31 | https://cdn.sofifa.org/players/4/19/238205.png | India | https://cdn.sofifa.org/flags/159.png | 62 | 62 | No Club | ... | 67.0 | 62.0 | 68.0 | 14.0 | 15.0 | 14.0 | 11.0 | 7.0 | 0 | 0.0 |
5 rows × 90 columns
sns.lineplot(data=df, x="Age", y="Overall")
<Axes: xlabel='Age', ylabel='Overall'>
sns.lineplot(data=df, x="Age", y="Overall")
<Axes: xlabel='Age', ylabel='Overall'>
df['Nationality'].value_counts()[0:10]
England 1662 Germany 1198 Spain 1072 Argentina 937 France 914 Brazil 827 Italy 702 Colombia 618 Japan 478 Netherlands 453 Name: Nationality, dtype: int64
import matplotlib.pyplot as plt
# Create a figure
plt.figure(figsize=(8, 5))
# Select the top 5 Nationalities and create a bar plot
top_nationalities = df['Nationality'].value_counts().head(5)
plt.bar(top_nationalities.index, top_nationalities, color="g")
# Show the plot
plt.show()
#Finding Out Which Player Gets The Highest Wages.
player_salary = df[['Name' , 'Wage']]
player_salary.head()
| Name | Wage | |
|---|---|---|
| 0 | L. Messi | €565 |
| 1 | Cristiano Ronaldo | €405 |
| 2 | Neymar Jr | €290 |
| 3 | De Gea | €260 |
| 4 | K. De Bruyne | €355 |
import matplotlib.pyplot as plt
# Create a figure with a specified size
plt.figure(figsize=(8, 5))
# Define the data you want to plot as regular Python lists
players = player_salary['Name'][0:5]
wages = player_salary['Wage'][0:5]
# Create a bar chart
plt.bar(players, wages, color="red")
# Display the chart
plt.show()
#Germany
Germany = df[df['Nationality'] == 'Germany']
Germany.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 11 | 182521 | T. Kroos | 28 | https://cdn.sofifa.org/players/4/19/182521.png | Germany | https://cdn.sofifa.org/flags/21.png | 90 | 90 | Real Madrid | ... | 72.0 | 79.0 | 69.0 | 10.0 | 11.0 | 13.0 | 7.0 | 10.0 | €156.8 | 355000.0 |
| 18 | 18 | 192448 | M. ter Stegen | 26 | https://cdn.sofifa.org/players/4/19/192448.png | Germany | https://cdn.sofifa.org/flags/21.png | 89 | 92 | FC Barcelona | ... | 25.0 | 13.0 | 10.0 | 87.0 | 85.0 | 88.0 | 85.0 | 90.0 | €123.3 | 240000.0 |
| 22 | 22 | 167495 | M. Neuer | 32 | https://cdn.sofifa.org/players/4/19/167495.png | Germany | https://cdn.sofifa.org/flags/21.png | 89 | 89 | FC Bayern München | ... | 17.0 | 10.0 | 11.0 | 90.0 | 86.0 | 91.0 | 87.0 | 87.0 | €62.7 | 130000.0 |
| 34 | 34 | 178603 | M. Hummels | 29 | https://cdn.sofifa.org/players/4/19/178603.png | Germany | https://cdn.sofifa.org/flags/21.png | 88 | 88 | FC Bayern München | ... | 88.0 | 90.0 | 88.0 | 15.0 | 6.0 | 10.0 | 5.0 | 6.0 | €75.9 | 160000.0 |
| 55 | 55 | 222492 | L. Sané | 22 | https://cdn.sofifa.org/players/4/19/222492.png | Germany | https://cdn.sofifa.org/flags/21.png | 86 | 92 | Manchester City | ... | 36.0 | 32.0 | 35.0 | 8.0 | 12.0 | 9.0 | 9.0 | 14.0 | €125.1 | 195000.0 |
5 rows × 90 columns
#Finding Out Who Is The Tallest German Player Or The Player Belongs To German Nationality
Germany.sort_values(by=[ 'Height'], ascending=False).head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 17786 | 17786 | 240218 | A. Weidinger | 21 | https://cdn.sofifa.org/players/4/19/240218.png | Germany | https://cdn.sofifa.org/flags/21.png | 52 | 60 | SSV Jahn Regensburg | ... | 14.0 | 13.0 | 11.0 | 51.0 | 54.0 | 56.0 | 56.0 | 49.0 | €105K | 1000.0 |
| 7785 | 7785 | 236831 | A. Seydel | 22 | https://cdn.sofifa.org/players/4/19/236831.png | Germany | https://cdn.sofifa.org/flags/21.png | 67 | 76 | Holstein Kiel | ... | 25.0 | 28.0 | 22.0 | 12.0 | 9.0 | 14.0 | 11.0 | 9.0 | 0 | 9000.0 |
| 13520 | 13520 | 239746 | L. Watkowiak | 22 | https://cdn.sofifa.org/players/4/19/239746.png | Germany | https://cdn.sofifa.org/flags/21.png | 62 | 68 | SV Wehen Wiesbaden | ... | 7.0 | 13.0 | 13.0 | 65.0 | 59.0 | 64.0 | 60.0 | 66.0 | €495K | 1000.0 |
| 4542 | 4542 | 158657 | T. Kessler | 32 | https://cdn.sofifa.org/players/4/19/158657.png | Germany | https://cdn.sofifa.org/flags/21.png | 71 | 71 | 1. FC Köln | ... | 12.0 | 14.0 | 15.0 | 72.0 | 69.0 | 67.0 | 70.0 | 71.0 | €2 | 10000.0 |
| 1426 | 1426 | 199833 | L. Unnerstall | 27 | https://cdn.sofifa.org/players/4/19/199833.png | Germany | https://cdn.sofifa.org/flags/21.png | 76 | 78 | VVV-Venlo | ... | 11.0 | 17.0 | 15.0 | 76.0 | 75.0 | 72.0 | 74.0 | 77.0 | 0 | 12000.0 |
5 rows × 90 columns
#Finding Out The German Player Who Has The Highest Weight
Germany.sort_values(by=[ 'Weight'], ascending=False).head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13520 | 13520 | 239746 | L. Watkowiak | 22 | https://cdn.sofifa.org/players/4/19/239746.png | Germany | https://cdn.sofifa.org/flags/21.png | 62 | 68 | SV Wehen Wiesbaden | ... | 7.0 | 13.0 | 13.0 | 65.0 | 59.0 | 64.0 | 60.0 | 66.0 | €495K | 1000.0 |
| 1426 | 1426 | 199833 | L. Unnerstall | 27 | https://cdn.sofifa.org/players/4/19/199833.png | Germany | https://cdn.sofifa.org/flags/21.png | 76 | 78 | VVV-Venlo | ... | 11.0 | 17.0 | 15.0 | 76.0 | 75.0 | 72.0 | 74.0 | 77.0 | 0 | 12000.0 |
| 210 | 210 | 179783 | R. Fährmann | 29 | https://cdn.sofifa.org/players/4/19/179783.png | Germany | https://cdn.sofifa.org/flags/21.png | 83 | 84 | FC Schalke 04 | ... | 10.0 | 12.0 | 10.0 | 83.0 | 81.0 | 52.0 | 82.0 | 87.0 | €35.5 | 38000.0 |
| 165 | 165 | 213331 | J. Tah | 22 | https://cdn.sofifa.org/players/4/19/213331.png | Germany | https://cdn.sofifa.org/flags/21.png | 83 | 88 | Bayer 04 Leverkusen | ... | 80.0 | 88.0 | 84.0 | 11.0 | 8.0 | 7.0 | 9.0 | 14.0 | €52.4 | 67000.0 |
| 1225 | 1225 | 200212 | M. Esser | 30 | https://cdn.sofifa.org/players/4/19/200212.png | Germany | https://cdn.sofifa.org/flags/21.png | 76 | 76 | Hannover 96 | ... | 20.0 | 15.0 | 16.0 | 76.0 | 74.0 | 69.0 | 75.0 | 78.0 | €8.9 | 24000.0 |
5 rows × 90 columns
#Finding Out Who Are The Top Most Earning German Players.
Germany.sort_values(by=[ 'Wage'], ascending=False).head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 82 | 82 | 212622 | J. Kimmich | 23 | https://cdn.sofifa.org/players/4/19/212622.png | Germany | https://cdn.sofifa.org/flags/21.png | 85 | 88 | FC Bayern München | ... | 75.0 | 81.0 | 79.0 | 8.0 | 15.0 | 7.0 | 15.0 | 15.0 | €69.9 | 92000.0 |
| 184 | 184 | 202166 | J. Draxler | 24 | https://cdn.sofifa.org/players/4/19/202166.png | Germany | https://cdn.sofifa.org/flags/21.png | 83 | 86 | Paris Saint-Germain | ... | 39.0 | 64.0 | 44.0 | 11.0 | 13.0 | 5.0 | 13.0 | 8.0 | €62.6 | 91000.0 |
| 448 | 448 | 208333 | E. Can | 24 | https://cdn.sofifa.org/players/4/19/208333.png | Germany | https://cdn.sofifa.org/flags/21.png | 80 | 85 | Juventus | ... | 82.0 | 81.0 | 80.0 | 14.0 | 8.0 | 8.0 | 13.0 | 11.0 | €33.6 | 91000.0 |
| 5613 | 5613 | 238072 | E. Löwen | 21 | https://cdn.sofifa.org/players/4/19/238072.png | Germany | https://cdn.sofifa.org/flags/21.png | 70 | 80 | 1. FC Nürnberg | ... | 61.0 | 66.0 | 60.0 | 5.0 | 10.0 | 5.0 | 12.0 | 13.0 | €5 | 9000.0 |
| 8607 | 8607 | 158172 | M. Parensen | 32 | https://cdn.sofifa.org/players/4/19/158172.png | Germany | https://cdn.sofifa.org/flags/21.png | 67 | 67 | 1. FC Union Berlin | ... | 66.0 | 63.0 | 65.0 | 7.0 | 15.0 | 7.0 | 9.0 | 9.0 | €680K | 9000.0 |
5 rows × 90 columns
#Columns with NaN null values
df.columns[df.isna().any()]
Index([], dtype='object')
#checking for NaN values
df.isna().any().any()
False
#Players who are goalkeepers
strickers_df=df[['Name','Position','Nationality','Club','Overall','Potential']]
strickers_df=strickers_df[strickers_df['Position'].str.contains(pat='GK')]
strickers_df.sort_values('Overall',ascending=False).head(20)
| Name | Position | Nationality | Club | Overall | Potential | |
|---|---|---|---|---|---|---|
| 3 | De Gea | GK | Spain | Manchester United | 91 | 93 |
| 9 | J. Oblak | GK | Slovenia | Atlético Madrid | 90 | 93 |
| 18 | M. ter Stegen | GK | Germany | FC Barcelona | 89 | 92 |
| 19 | T. Courtois | GK | Belgium | Real Madrid | 89 | 90 |
| 22 | M. Neuer | GK | Germany | FC Bayern München | 89 | 89 |
| 37 | H. Lloris | GK | France | Tottenham Hotspur | 88 | 88 |
| 40 | S. Handanovič | GK | Slovenia | Inter | 88 | 88 |
| 41 | G. Buffon | GK | Italy | Paris Saint-Germain | 88 | 88 |
| 46 | K. Navas | GK | Costa Rica | Real Madrid | 87 | 87 |
| 57 | Ederson | GK | Brazil | Manchester City | 86 | 90 |
| 81 | Alisson | GK | Brazil | Liverpool | 85 | 90 |
| 92 | W. Szczęsny | GK | Poland | Juventus | 85 | 87 |
| 133 | L. Hrádecký | GK | Finland | Bayer 04 Leverkusen | 84 | 84 |
| 147 | S. Ruffier | GK | France | AS Saint-Étienne | 84 | 84 |
| 141 | Sergio Asenjo | GK | Spain | Villarreal CF | 84 | 85 |
| 149 | K. Schmeichel | GK | Denmark | Leicester City | 84 | 84 |
| 131 | B. Leno | GK | Germany | Arsenal | 84 | 87 |
| 126 | A. Lopes | GK | Portugal | Olympique Lyonnais | 84 | 86 |
| 128 | M. Perin | GK | Italy | Juventus | 84 | 89 |
| 210 | R. Fährmann | GK | Germany | FC Schalke 04 | 83 | 84 |
#Let's check how many players are here only from England
eng_players_df=df[df.Nationality=='England']
eng_players_df
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 16 | 16 | 202126 | H. Kane | 24 | https://cdn.sofifa.org/players/4/19/202126.png | England | https://cdn.sofifa.org/flags/14.png | 89 | 91 | Tottenham Hotspur | ... | 56.0 | 36.0 | 38.0 | 8.0 | 10.0 | 11.0 | 14.0 | 11.0 | €160.7 | 205000.0 |
| 60 | 60 | 202652 | R. Sterling | 23 | https://cdn.sofifa.org/players/4/19/202652.png | England | https://cdn.sofifa.org/flags/14.png | 86 | 89 | Manchester City | ... | 47.0 | 58.0 | 54.0 | 15.0 | 12.0 | 12.0 | 15.0 | 9.0 | €108.8 | 195000.0 |
| 117 | 117 | 211117 | D. Alli | 22 | https://cdn.sofifa.org/players/4/19/211117.png | England | https://cdn.sofifa.org/flags/14.png | 84 | 90 | Tottenham Hotspur | ... | 70.0 | 70.0 | 63.0 | 7.0 | 6.0 | 9.0 | 11.0 | 8.0 | €87.1 | 115000.0 |
| 135 | 135 | 188377 | K. Walker | 28 | https://cdn.sofifa.org/players/4/19/188377.png | England | https://cdn.sofifa.org/flags/14.png | 84 | 84 | Manchester City | ... | 78.0 | 84.0 | 83.0 | 12.0 | 6.0 | 16.0 | 15.0 | 8.0 | €45.3 | 165000.0 |
| 180 | 180 | 204935 | J. Pickford | 24 | https://cdn.sofifa.org/players/4/19/204935.png | England | https://cdn.sofifa.org/flags/14.png | 83 | 88 | Everton | ... | 16.0 | 20.0 | 12.0 | 83.0 | 78.0 | 88.0 | 81.0 | 86.0 | €49.4 | 78000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18198 | 18198 | 242844 | J. Livesey | 18 | https://cdn.sofifa.org/players/4/19/242844.png | England | https://cdn.sofifa.org/flags/14.png | 47 | 70 | Burton Albion | ... | 15.0 | 11.0 | 13.0 | 46.0 | 52.0 | 58.0 | 42.0 | 48.0 | €165K | 1000.0 |
| 18202 | 18202 | 238813 | J. Lundstram | 19 | https://cdn.sofifa.org/players/4/19/238813.png | England | https://cdn.sofifa.org/flags/14.png | 47 | 65 | Crewe Alexandra | ... | 40.0 | 48.0 | 47.0 | 10.0 | 13.0 | 7.0 | 8.0 | 9.0 | €143K | 1000.0 |
| 18204 | 18204 | 241638 | B. Worman | 16 | https://cdn.sofifa.org/players/4/19/241638.png | England | https://cdn.sofifa.org/flags/14.png | 47 | 67 | Cambridge United | ... | 32.0 | 13.0 | 11.0 | 6.0 | 5.0 | 10.0 | 6.0 | 13.0 | €165K | 1000.0 |
| 18205 | 18205 | 246268 | D. Walker-Rice | 17 | https://cdn.sofifa.org/players/4/19/246268.png | England | https://cdn.sofifa.org/flags/14.png | 47 | 66 | Tranmere Rovers | ... | 20.0 | 25.0 | 27.0 | 14.0 | 6.0 | 14.0 | 8.0 | 9.0 | €143K | 1000.0 |
| 18206 | 18206 | 246269 | G. Nugent | 16 | https://cdn.sofifa.org/players/4/19/246269.png | England | https://cdn.sofifa.org/flags/14.png | 46 | 66 | Tranmere Rovers | ... | 40.0 | 43.0 | 50.0 | 10.0 | 15.0 | 9.0 | 12.0 | 9.0 | €165K | 1000.0 |
1662 rows × 90 columns
print('Total number of players from England : {}'.format(eng_players_df.shape[0]))
Total number of players from England : 1662
clubs_df=df[['Club',]].drop_duplicates(subset='Club')
clubs_df
| Club | |
|---|---|
| 0 | FC Barcelona |
| 1 | Juventus |
| 2 | Paris Saint-Germain |
| 3 | Manchester United |
| 4 | Manchester City |
| ... | ... |
| 12006 | Newport County |
| 13054 | Sligo Rovers |
| 13369 | Derry City |
| 13683 | Limerick FC |
| 15950 | Bray Wanderers |
652 rows × 1 columns
print('Toal Number of clubs : {}'.format(clubs_df.shape[0]))
Toal Number of clubs : 652
#Top 10 positions of maxmium players
df.Position.value_counts().head(10)
ST 2212 GK 2025 CB 1778 CM 1394 LB 1322 RB 1291 RM 1124 LM 1095 CAM 958 CDM 948 Name: Position, dtype: int64
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
#matplotlib.rcParams['font.size'] = 14
#matplotlib.rcParams['figure.figsize'] = (20, 10)
#matplotlib.rcParams['figure.facecolor'] = '#00000000'
#Clustering Players by Overall and Potential Rating
player_highest_overall_df=df[['Name','Overall','Potential','Age','Nationality','Club','Preferred Foot']].copy()
player_highest_overall_df=player_highest_overall_df.sort_values('Overall',ascending=False)
player_highest_overall_df
| Name | Overall | Potential | Age | Nationality | Club | Preferred Foot | |
|---|---|---|---|---|---|---|---|
| 0 | L. Messi | 94 | 94 | 31 | Argentina | FC Barcelona | Left |
| 1 | Cristiano Ronaldo | 94 | 94 | 33 | Portugal | Juventus | Right |
| 2 | Neymar Jr | 92 | 93 | 26 | Brazil | Paris Saint-Germain | Right |
| 3 | De Gea | 91 | 93 | 27 | Spain | Manchester United | Right |
| 4 | K. De Bruyne | 91 | 92 | 27 | Belgium | Manchester City | Right |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 18190 | L. Watkins | 47 | 67 | 18 | England | Cambridge United | Right |
| 18189 | A. Kaltner | 47 | 61 | 18 | Germany | SpVgg Unterhaching | Right |
| 18187 | C. Ehlich | 47 | 59 | 19 | Germany | SpVgg Unterhaching | Right |
| 18186 | Zhang Yufeng | 47 | 64 | 20 | China PR | Beijing Renhe FC | Right |
| 18206 | G. Nugent | 46 | 66 | 16 | England | Tranmere Rovers | Right |
18207 rows × 7 columns
#Top 10 players with highest overall rating
top_10_overall_df=player_highest_overall_df.head(10)
top_10_overall_df
| Name | Overall | Potential | Age | Nationality | Club | Preferred Foot | |
|---|---|---|---|---|---|---|---|
| 0 | L. Messi | 94 | 94 | 31 | Argentina | FC Barcelona | Left |
| 1 | Cristiano Ronaldo | 94 | 94 | 33 | Portugal | Juventus | Right |
| 2 | Neymar Jr | 92 | 93 | 26 | Brazil | Paris Saint-Germain | Right |
| 3 | De Gea | 91 | 93 | 27 | Spain | Manchester United | Right |
| 4 | K. De Bruyne | 91 | 92 | 27 | Belgium | Manchester City | Right |
| 5 | E. Hazard | 91 | 91 | 27 | Belgium | Chelsea | Right |
| 6 | L. Modrić | 91 | 91 | 32 | Croatia | Real Madrid | Right |
| 7 | L. Suárez | 91 | 91 | 31 | Uruguay | FC Barcelona | Right |
| 8 | Sergio Ramos | 91 | 91 | 32 | Spain | Real Madrid | Right |
| 12 | D. Godín | 90 | 90 | 32 | Uruguay | Atlético Madrid | Right |
player_highest_potential_df=df[['Name','Potential','Overall','Age','Nationality','Club','Preferred Foot']].copy()
player_highest_potential_df=player_highest_potential_df.sort_values('Potential',ascending=False)
player_highest_potential_df
| Name | Potential | Overall | Age | Nationality | Club | Preferred Foot | |
|---|---|---|---|---|---|---|---|
| 25 | K. Mbappé | 95 | 88 | 19 | France | Paris Saint-Germain | Right |
| 0 | L. Messi | 94 | 94 | 31 | Argentina | FC Barcelona | Left |
| 15 | P. Dybala | 94 | 89 | 24 | Argentina | Juventus | Left |
| 1 | Cristiano Ronaldo | 94 | 94 | 33 | Portugal | Juventus | Right |
| 2 | Neymar Jr | 93 | 92 | 26 | Brazil | Paris Saint-Germain | Right |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 17988 | Wang Xuanhong | 51 | 51 | 28 | China PR | Beijing Renhe FC | Right |
| 18043 | A. Suzuki | 50 | 50 | 31 | Japan | Yokohama F. Marinos | Right |
| 18025 | J. Miszczuk | 50 | 50 | 27 | Poland | Jagiellonia Białystok | Right |
| 18183 | K. Pilkington | 48 | 48 | 44 | England | Cambridge United | Right |
| 18171 | Y. Uchimura | 48 | 48 | 33 | Japan | Hokkaido Consadole Sapporo | Right |
18207 rows × 7 columns
##Top 10 players with highest potential rating
top_10_potential_df=player_highest_potential_df.head(10)
top_10_potential_df
| Name | Potential | Overall | Age | Nationality | Club | Preferred Foot | |
|---|---|---|---|---|---|---|---|
| 25 | K. Mbappé | 95 | 88 | 19 | France | Paris Saint-Germain | Right |
| 0 | L. Messi | 94 | 94 | 31 | Argentina | FC Barcelona | Left |
| 15 | P. Dybala | 94 | 89 | 24 | Argentina | Juventus | Left |
| 1 | Cristiano Ronaldo | 94 | 94 | 33 | Portugal | Juventus | Right |
| 2 | Neymar Jr | 93 | 92 | 26 | Brazil | Paris Saint-Germain | Right |
| 3 | De Gea | 93 | 91 | 27 | Spain | Manchester United | Right |
| 9 | J. Oblak | 93 | 90 | 25 | Slovenia | Atlético Madrid | Right |
| 229 | G. Donnarumma | 93 | 82 | 19 | Italy | Milan | Right |
| 1143 | Vinícius Júnior | 92 | 77 | 17 | Brazil | Real Madrid | Right |
| 155 | O. Dembélé | 92 | 83 | 21 | France | FC Barcelona | Left |
#Top 10 players with potential rating + Top 10 players with overall rating
top_10_combined_df=pd.concat([top_10_potential_df,top_10_overall_df], axis=0)
top_10_combined_df=top_10_combined_df.drop_duplicates(subset='Name')
combined_df=top_10_combined_df.groupby('Name')[['Overall','Potential']].max()
combined_df=combined_df.sort_values('Overall',ascending=True)
combined_df
| Overall | Potential | |
|---|---|---|
| Name | ||
| Vinícius Júnior | 77 | 92 |
| G. Donnarumma | 82 | 93 |
| O. Dembélé | 83 | 92 |
| K. Mbappé | 88 | 95 |
| P. Dybala | 89 | 94 |
| D. Godín | 90 | 90 |
| J. Oblak | 90 | 93 |
| De Gea | 91 | 93 |
| E. Hazard | 91 | 91 |
| K. De Bruyne | 91 | 92 |
| L. Modrić | 91 | 91 |
| L. Suárez | 91 | 91 |
| Sergio Ramos | 91 | 91 |
| Neymar Jr | 92 | 93 |
| Cristiano Ronaldo | 94 | 94 |
| L. Messi | 94 | 94 |
combined_df.plot(kind='barh',figsize=(15,10))
plt.title('Overall Rating vs Potential Rating ',fontsize=25)
plt.xlim(0,110,10)
plt.xlabel('Name')
plt.ylabel('Rating')
plt.legend();
#Players Overall and Potential Rating Distribution
plt.figure(figsize=(10,5))
plt.hist([df.Overall,df.Potential],color=['blue','black'],alpha=0.4,stacked=True)
plt.title('Players Overall and Potential Rating Distribution',fontsize=20)
plt.xlabel('Rating',fontsize=20)
plt.ylabel('Players Frequency',fontsize=20)
plt.legend(['overall','potential'],fontsize=20);
#Clustering players By Nationality
#Players number counts country wise
country_count_df=df['Nationality'].value_counts()
country_count_df
England 1662
Germany 1198
Spain 1072
Argentina 937
France 914
...
New Caledonia 1
Fiji 1
São Tomé & Príncipe 1
United Arab Emirates 1
Botswana 1
Name: Nationality, Length: 164, dtype: int64
#Let's plot Top 20 countries with maximum number of players in FIFA 21
country_df=pd.DataFrame(country_count_df.head(20))
country_df.sort_values('Nationality',ascending=True).plot(kind='barh',figsize=(15,10),color='lightgreen')
plt.title('Top 20 Countries With Maximum Number of Players in FIFA 21',fontsize=25)
plt.ylabel('Countries',fontsize=20)
plt.xlabel('Number of Players',fontsize=20)
plt.legend(['Number of Players']);
#Clustering players by International Reputation
rep_df=df[['Name','International Reputation']].sort_values('International Reputation',ascending=False)
rep_df
| Name | International Reputation | |
|---|---|---|
| 0 | L. Messi | 5.0 |
| 2 | Neymar Jr | 5.0 |
| 22 | M. Neuer | 5.0 |
| 109 | Z. Ibrahimović | 5.0 |
| 7 | L. Suárez | 5.0 |
| ... | ... | ... |
| 7113 | S. Nakatani | 1.0 |
| 7114 | M. Niemeyer | 1.0 |
| 7115 | Léo Silva | 1.0 |
| 7116 | R. Boateng | 1.0 |
| 18206 | G. Nugent | 1.0 |
18207 rows × 2 columns
#Top 10 players with respect to their international reputation
rep_df.head(10)
| Name | International Reputation | |
|---|---|---|
| 0 | L. Messi | 5.0 |
| 2 | Neymar Jr | 5.0 |
| 22 | M. Neuer | 5.0 |
| 109 | Z. Ibrahimović | 5.0 |
| 7 | L. Suárez | 5.0 |
| 1 | Cristiano Ronaldo | 5.0 |
| 93 | A. Sánchez | 4.0 |
| 54 | Piqué | 4.0 |
| 53 | I. Rakitić | 4.0 |
| 552 | W. Rooney | 4.0 |
#Representation of The Distribution of International Reputation of Players in FIFA 21
#Plotting a pie chart to represent the distribution of international reputation of players in FIFA 21
labels=['1','2','3','4','5']
sizes=df['International Reputation'].value_counts()
colors=['green','blue','yellow','red','black']
explode=[0.1,0.1,0.2,0.5,0.9]
plt.rcParams['figure.figsize']=(10,10)
plt.pie(sizes,labels=labels,colors=colors,explode=explode,shadow=True)
plt.title('International Reputation of The Players in FIFA 21',fontsize=20)
plt.legend()
plt.show()
v=df['International Reputation'].value_counts()
print('Out of all players, {:.2f}% players have international reputation of 1 in FIFA 21'.format(v.iloc[0]/df.shape[0]*100))
Out of all players, 91.06% players have international reputation of 1 in FIFA 21
#Each Different Playing Positions Distribution in FIFA 21
#printing total different positions
c=0
for i in df.Position.unique():
if ',' not in i:
print(i)
c+=1
print('\n\nTotal {} different playing positions.'.format(c))
RF ST LW GK RCM LF RS RCB LCM CB LDM CAM CDM LS LCB RM LAM LM LB RDM RW CM RB RAM CF RWB LWB Total 27 different playing positions.
#Taking each different position players in each seperate dataframes
plis=[df[df.Position.str[-2:]=='GK'].value_counts().sum(),
df[df.Position.str[-2:]=='ST'].value_counts().sum(),
df[df.Position.str[-2:]=='CB'].value_counts().sum(),
df[df.Position.str[-2:]=='LW'].value_counts().sum(),
df[df.Position.str[-2:]=='RW'].value_counts().sum(),
df[df.Position.str[-3:]=='CDM'].value_counts().sum(),
df[df.Position.str[-2:]=='CM'].value_counts().sum(),
df[df.Position.str[-2:]=='RB'].value_counts().sum(),
df[df.Position.str[-2:]=='LB'].value_counts().sum(),
df[df.Position.str[-2:]=='CF'].value_counts().sum(),
df[df.Position.str[-2:]=='LM'].value_counts().sum(),
df[df.Position.str[-2:]=='RM'].value_counts().sum(),
df[df.Position.str[-3:]=='LWB'].value_counts().sum(),
df[df.Position.str[-3:]=='CAM'].value_counts().sum(),
df[df.Position.str[-3:]=='RWB'].value_counts().sum()]
sns.set_theme()
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (15, 8)
from IPython.display import display
with pd.option_context('display.max_columns',None):
display(df.describe())
| Unnamed: 0 | ID | Age | Overall | Potential | Special | International Reputation | Weak Foot | Skill Moves | Jersey Number | Crossing | Finishing | HeadingAccuracy | ShortPassing | Volleys | Dribbling | Curve | FKAccuracy | LongPassing | BallControl | Acceleration | SprintSpeed | Agility | Reactions | Balance | ShotPower | Jumping | Stamina | Strength | LongShots | Aggression | Interceptions | Positioning | Vision | Penalties | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 |
| mean | 9103.000000 | 214298.338606 | 25.122206 | 66.238699 | 71.307299 | 1597.809908 | 1.112924 | 2.947438 | 2.360356 | 19.508046 | 49.734181 | 45.550911 | 52.298144 | 58.686712 | 42.909026 | 55.371001 | 47.170824 | 42.750151 | 52.711933 | 58.369459 | 64.443730 | 64.556324 | 63.336189 | 61.673587 | 63.797935 | 55.313835 | 64.917834 | 63.053276 | 65.139781 | 46.985775 | 55.721700 | 46.575163 | 49.826770 | 53.260120 | 48.420607 | 58.493656 | 47.156973 | 47.572088 | 45.541056 | 16.572417 | 16.348382 | 16.189268 | 16.345691 | 16.666831 | 9731.312133 |
| std | 5256.052511 | 29965.244204 | 4.669943 | 6.908930 | 6.136496 | 272.586016 | 0.393554 | 0.659591 | 0.755394 | 15.935210 | 18.340299 | 19.500064 | 17.356983 | 14.680105 | 17.671067 | 18.885426 | 18.370998 | 17.593545 | 15.307651 | 16.664584 | 15.271849 | 15.002398 | 15.101839 | 9.540921 | 14.493574 | 17.448546 | 12.267253 | 16.201430 | 12.979956 | 19.386223 | 17.580066 | 20.807859 | 19.670807 | 14.391148 | 15.879699 | 11.810375 | 20.025458 | 21.773243 | 21.389596 | 17.692536 | 16.905507 | 16.502101 | 17.032944 | 17.951898 | 21999.290406 |
| min | 0.000000 | 16.000000 | 16.000000 | 46.000000 | 48.000000 | 731.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 5.000000 | 2.000000 | 4.000000 | 7.000000 | 4.000000 | 4.000000 | 6.000000 | 0.000000 | 9.000000 | 5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 4551.500000 | 200315.500000 | 21.000000 | 62.000000 | 67.000000 | 1457.000000 | 1.000000 | 3.000000 | 2.000000 | 8.000000 | 38.000000 | 30.000000 | 45.000000 | 54.000000 | 30.000000 | 49.000000 | 34.000000 | 31.000000 | 43.000000 | 54.000000 | 57.000000 | 57.000000 | 55.000000 | 56.000000 | 56.000000 | 45.000000 | 58.000000 | 56.000000 | 58.000000 | 32.000000 | 44.000000 | 26.000000 | 38.000000 | 44.000000 | 39.000000 | 51.000000 | 30.000000 | 26.000000 | 24.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 1000.000000 |
| 50% | 9103.000000 | 221759.000000 | 25.000000 | 66.000000 | 71.000000 | 1635.000000 | 1.000000 | 3.000000 | 2.000000 | 17.000000 | 54.000000 | 49.000000 | 56.000000 | 62.000000 | 44.000000 | 61.000000 | 48.000000 | 41.000000 | 56.000000 | 63.000000 | 67.000000 | 67.000000 | 66.000000 | 62.000000 | 66.000000 | 59.000000 | 66.000000 | 66.000000 | 66.000000 | 51.000000 | 59.000000 | 52.000000 | 55.000000 | 55.000000 | 49.000000 | 59.000000 | 53.000000 | 55.000000 | 52.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 3000.000000 |
| 75% | 13654.500000 | 236529.500000 | 28.000000 | 71.000000 | 75.000000 | 1787.000000 | 1.000000 | 3.000000 | 3.000000 | 26.000000 | 64.000000 | 62.000000 | 64.000000 | 68.000000 | 57.000000 | 68.000000 | 62.000000 | 56.000000 | 64.000000 | 69.000000 | 75.000000 | 75.000000 | 74.000000 | 68.000000 | 74.000000 | 68.000000 | 73.000000 | 74.000000 | 74.000000 | 62.000000 | 69.000000 | 64.000000 | 64.000000 | 64.000000 | 60.000000 | 67.000000 | 64.000000 | 66.000000 | 64.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 9000.000000 |
| max | 18206.000000 | 246620.000000 | 45.000000 | 94.000000 | 95.000000 | 2346.000000 | 5.000000 | 5.000000 | 5.000000 | 99.000000 | 93.000000 | 95.000000 | 94.000000 | 93.000000 | 90.000000 | 97.000000 | 94.000000 | 94.000000 | 93.000000 | 96.000000 | 97.000000 | 96.000000 | 96.000000 | 96.000000 | 96.000000 | 95.000000 | 95.000000 | 96.000000 | 97.000000 | 94.000000 | 95.000000 | 92.000000 | 95.000000 | 94.000000 | 92.000000 | 96.000000 | 94.000000 | 93.000000 | 91.000000 | 90.000000 | 92.000000 | 91.000000 | 90.000000 | 94.000000 | 565000.000000 |
#Filling the missing value for the continuous variables for proper data visualization
df['ShortPassing'].fillna(df['ShortPassing'].mean(), inplace = True)
df['Volleys'].fillna(df['Volleys'].mean(), inplace = True)
df['Dribbling'].fillna(df['Dribbling'].mean(), inplace = True)
df['Curve'].fillna(df['Curve'].mean(), inplace = True)
df['FKAccuracy'].fillna(df['FKAccuracy'], inplace = True)
df['LongPassing'].fillna(df['LongPassing'].mean(), inplace = True)
df['BallControl'].fillna(df['BallControl'].mean(), inplace = True)
df['HeadingAccuracy'].fillna(df['HeadingAccuracy'].mean(), inplace = True)
df['Finishing'].fillna(df['Finishing'].mean(), inplace = True)
df['Crossing'].fillna(df['Crossing'].mean(), inplace = True)
df['Weight'].fillna('200lbs', inplace = True)
df['Contract Valid Until'].fillna(2019, inplace = True)
df['Height'].fillna("5'11", inplace = True)
df['Loaned From'].fillna('None', inplace = True)
df['Joined'].fillna('Jul 1, 2018', inplace = True)
df['Jersey Number'].fillna(8, inplace = True)
df['Body Type'].fillna('Normal', inplace = True)
df['Position'].fillna('ST', inplace = True)
df['Club'].fillna('No Club', inplace = True)
df['Work Rate'].fillna('Medium/ Medium', inplace = True)
df['Skill Moves'].fillna(df['Skill Moves'].median(), inplace = True)
df['Weak Foot'].fillna(3, inplace = True)
df['Preferred Foot'].fillna('Right', inplace = True)
df['International Reputation'].fillna(1, inplace = True)
df['Wage'].fillna('€200K', inplace = True)
df.fillna(0, inplace = True)
def defending(df):
return int(round((df[['Marking', 'StandingTackle',
'SlidingTackle']].mean()).mean()))
def general(df):
return int(round((df[['HeadingAccuracy', 'Dribbling', 'Curve',
'BallControl']].mean()).mean()))
def mental(df):
return int(round((df[['Aggression', 'Interceptions', 'Positioning',
'Vision','Composure']].mean()).mean()))
def passing(df):
return int(round((df[['Crossing', 'ShortPassing',
'LongPassing']].mean()).mean()))
def mobility(df):
return int(round((df[['Acceleration', 'SprintSpeed',
'Agility','Reactions']].mean()).mean()))
def power(df):
return int(round((df[['Balance', 'Jumping', 'Stamina',
'Strength']].mean()).mean()))
def rating(df):
return int(round((df[['Potential', 'Overall']].mean()).mean()))
def shooting(df):
return int(round((df[['Finishing', 'Volleys', 'FKAccuracy',
'ShotPower','LongShots', 'Penalties']].mean()).mean()))
#Defining a function for cleaning the Weight data
def extract_value_from(value):
out = value.replace('lbs', '')
return float(out)
# applying the function to weight column
#data['value'] = data['value'].apply(lambda x: extract_value_from(x))
df['Weight'] = df['Weight'].apply(lambda x : extract_value_from(x))
df['Weight'].head()
0 159.0 1 183.0 2 150.0 3 168.0 4 154.0 Name: Weight, dtype: float64
#Defining a function for cleaning the wage column
def extract_value_from(Value):
out = Value.replace('€', '')
if 'M' in out:
out = float(out.replace('M', ''))*1000000
elif 'K' in Value:
out = float(out.replace('K', ''))*1000
return float(out)
#Skill Moves of Players
plt.figure(figsize = (10, 8))
ax = sns.countplot(x = 'Skill Moves', data = df, palette = 'pastel')
ax.set_title(label = 'Count of players on Basis of their skill moves', fontsize = 20)
ax.set_xlabel(xlabel = 'Number of Skill Moves', fontsize = 16)
ax.set_ylabel(ylabel = 'Count', fontsize = 16)
plt.show()
#Height of Players
plt.figure(figsize = (13, 8))
ax = sns.countplot(x = 'Height', data = df, palette = 'dark')
ax.set_title(label = 'Count of players on Basis of Height', fontsize = 20)
ax.set_xlabel(xlabel = 'Height in Foot per inch', fontsize = 16)
ax.set_ylabel(ylabel = 'Count', fontsize = 16)
plt.show()
# Checking for records that contain missing values
null_val = df[df.isnull().any(axis = 1)]
null_val
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | Normalized_Wage |
|---|
0 rows × 90 columns
# Overall rating distribution
plt.figure(figsize=(15,8))
sns.distplot(df['Overall'], bins=15, color='r')
plt.title('Overall Rating Distribution in FIFA 23', fontsize = 16)
plt.show()
#Correlation Heatmap
#Next up, we want to see the correlation among the
#relevant player attributes (Overall Rating, Potential Rating, Value, Age, Height, Weight, Wage, Release Clause).
#Correlation matrices give such valuable insights as they show which attribute is influenced by the other. We will first of all,
#compute the correlation matrix of these relevant attributes.
# compute correlation matrix of the relevant columns
relevant_col = df[['Overall', 'Potential', 'Age']].corr()
corr = relevant_col.corr()
corr
| Overall | Potential | Age | |
|---|---|---|---|
| Overall | 1.000000 | 0.601936 | -0.309101 |
| Potential | 0.601936 | 1.000000 | -0.945498 |
| Age | -0.309101 | -0.945498 | 1.000000 |
# Display correlation heatmap
plt.figure(figsize=(15,10))
mask= np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, mask=mask, cmap= 'vlag', center=0)
<Axes: >
#Age vs Overall Rating
#Now we’re going to create a scatter plot showing
#the relationship between Age and Overall Rating.
# Age vs Overall rating
plt.figure(figsize=(15,8))
sns.scatterplot(x = 'Age', y = 'Overall', data = df)
plt.title('Age vs Overall Rating', fontsize = 15)
plt.xlabel('Age', fontsize = 15)
plt.ylabel('Overall Rating', fontsize = 15)
Text(0, 0.5, 'Overall Rating')
# The Top 10 ranked by Overall rating
top_ten_overall = df[['Name','Overall','Age']].nlargest(10, 'Overall')
top_ten_overall
| Name | Overall | Age | |
|---|---|---|---|
| 0 | L. Messi | 94 | 31 |
| 1 | Cristiano Ronaldo | 94 | 33 |
| 2 | Neymar Jr | 92 | 26 |
| 3 | De Gea | 91 | 27 |
| 4 | K. De Bruyne | 91 | 27 |
| 5 | E. Hazard | 91 | 27 |
| 6 | L. Modrić | 91 | 32 |
| 7 | L. Suárez | 91 | 31 |
| 8 | Sergio Ramos | 91 | 32 |
| 9 | J. Oblak | 90 | 25 |
#Count of Players on Basis of Age
df['Age'].value_counts()
21 1423 26 1387 24 1358 22 1340 23 1332 25 1319 20 1240 27 1162 28 1101 19 1024 29 959 30 917 18 732 31 707 32 574 33 408 34 404 17 289 35 196 36 127 37 82 16 42 38 37 39 25 40 13 41 5 44 2 45 1 42 1 Name: Age, dtype: int64
plt.figure(figsize =(15,7))
sns.countplot(x = 'Age', data = df)
plt.title('Count of Players on the Basis of Age (FIFA 23)', fontsize = 16)
plt.xlabel('Age', fontsize = 16)
plt.ylabel('Count', fontsize = 16)
Text(0, 0.5, 'Count')
library("DALEX")
df_gbm_exp_deep <- DALEX::explain(df_gbm_deep,
df = fifa_small, y = 10^fifa_small$LogValue,
predict_function = function(m,x) 10^predict(m, x, n.trees = 250),
label = "GBM deep")
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
import numpy as np
X = df.drop(["Nationality", "Overall", "Potential",
"Value", "Wage"], axis = 1)
y = df['Value']
ylog = np.log(y)
X_train, X_test, ylog_train, ylog_test, y_train, y_test =
train_test_split(X, ylog, y, test_size = 0.25, random_state = 4)
gbm_model = LGBMRegressor()
gbm_model.fit(X_train, ylog_train, verbose = False)
Cell In[207], line 2 df_gbm_exp_deep <- DALEX::explain(df_gbm_deep, ^ SyntaxError: invalid syntax
df.describe().apply(lambda s: s.apply(lambda x: format(x,
'f')))
| Unnamed: 0 | ID | Age | Overall | Potential | Special | International Reputation | Weak Foot | Skill Moves | Jersey Number | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | ... | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 |
| mean | 9103.000000 | 214298.338606 | 25.122206 | 66.238699 | 71.307299 | 1597.809908 | 1.112924 | 2.947438 | 2.360356 | 19.508046 | ... | 58.493656 | 47.156973 | 47.572088 | 45.541056 | 16.572417 | 16.348382 | 16.189268 | 16.345691 | 16.666831 | 9731.312133 |
| std | 5256.052511 | 29965.244204 | 4.669943 | 6.908930 | 6.136496 | 272.586016 | 0.393554 | 0.659591 | 0.755394 | 15.935210 | ... | 11.810375 | 20.025458 | 21.773243 | 21.389596 | 17.692536 | 16.905507 | 16.502101 | 17.032944 | 17.951898 | 21999.290406 |
| min | 0.000000 | 16.000000 | 16.000000 | 46.000000 | 48.000000 | 731.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 4551.500000 | 200315.500000 | 21.000000 | 62.000000 | 67.000000 | 1457.000000 | 1.000000 | 3.000000 | 2.000000 | 8.000000 | ... | 51.000000 | 30.000000 | 26.000000 | 24.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 1000.000000 |
| 50% | 9103.000000 | 221759.000000 | 25.000000 | 66.000000 | 71.000000 | 1635.000000 | 1.000000 | 3.000000 | 2.000000 | 17.000000 | ... | 59.000000 | 53.000000 | 55.000000 | 52.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 3000.000000 |
| 75% | 13654.500000 | 236529.500000 | 28.000000 | 71.000000 | 75.000000 | 1787.000000 | 1.000000 | 3.000000 | 3.000000 | 26.000000 | ... | 67.000000 | 64.000000 | 66.000000 | 64.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 9000.000000 |
| max | 18206.000000 | 246620.000000 | 45.000000 | 94.000000 | 95.000000 | 2346.000000 | 5.000000 | 5.000000 | 5.000000 | 99.000000 | ... | 96.000000 | 94.000000 | 93.000000 | 91.000000 | 90.000000 | 92.000000 | 91.000000 | 90.000000 | 94.000000 | 565000.000000 |
8 rows × 46 columns
player_name = df[["Acceleration","Name","Position","Age" , "Nationality","SprintSpeed"]].nlargest(7, ["Acceleration"]).set_index("Name")
player_name
| Acceleration | Position | Age | Nationality | SprintSpeed | |
|---|---|---|---|---|---|
| Name | |||||
| Douglas Costa | 97.0 | LM | 27 | Brazil | 93.0 |
| Adama | 97.0 | RW | 22 | Spain | 96.0 |
| K. Mbappé | 96.0 | RM | 19 | France | 96.0 |
| K. Manneh | 96.0 | LM | 23 | United States | 93.0 |
| S. Mané | 95.0 | LM | 26 | Senegal | 93.0 |
| R. Sterling | 95.0 | RW | 23 | England | 92.0 |
| K. Coman | 95.0 | LM | 22 | France | 93.0 |
#Players Overall and Potential Rating vs Age in FIFA 21
fig, axes = plt.subplots(1, 2, sharex=True, figsize=(18,10))
fig.suptitle('Overall and Potential Rating vs Age',fontsize=20)
axes[0].set_title('Overall vs Age')
axes[1].set_title('Potential vs Age')
sns.scatterplot(ax=axes[0],x=df.Age,y=df.Overall);
sns.scatterplot(x=df.Age,y=df.Potential);
# Removing spaces in the column names to enable easy column reference
df.columns = df.columns.str.replace(' ','')
df.head(5)
| Unnamed:0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | ReleaseClause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 158023 | L. Messi | 31 | https://cdn.sofifa.org/players/4/19/158023.png | Argentina | https://cdn.sofifa.org/flags/52.png | 94 | 94 | FC Barcelona | ... | 33.0 | 28.0 | 26.0 | 6.0 | 11.0 | 15.0 | 14.0 | 8.0 | NaN | 565000.0 |
| 1 | 1 | 20801 | Cristiano Ronaldo | 33 | https://cdn.sofifa.org/players/4/19/20801.png | Portugal | https://cdn.sofifa.org/flags/38.png | 94 | 94 | Juventus | ... | 28.0 | 31.0 | 23.0 | 7.0 | 11.0 | 15.0 | 14.0 | 11.0 | NaN | 405000.0 |
| 2 | 2 | 190871 | Neymar Jr | 26 | https://cdn.sofifa.org/players/4/19/190871.png | Brazil | https://cdn.sofifa.org/flags/54.png | 92 | 93 | Paris Saint-Germain | ... | 27.0 | 24.0 | 33.0 | 9.0 | 9.0 | 15.0 | 15.0 | 11.0 | 8.1 | 290000.0 |
| 3 | 3 | 193080 | De Gea | 27 | https://cdn.sofifa.org/players/4/19/193080.png | Spain | https://cdn.sofifa.org/flags/45.png | 91 | 93 | Manchester United | ... | 15.0 | 21.0 | 13.0 | 90.0 | 85.0 | 87.0 | 88.0 | 94.0 | NaN | 260000.0 |
| 4 | 4 | 192985 | K. De Bruyne | 27 | https://cdn.sofifa.org/players/4/19/192985.png | Belgium | https://cdn.sofifa.org/flags/7.png | 91 | 92 | Manchester City | ... | 68.0 | 58.0 | 51.0 | 15.0 | 13.0 | 5.0 | 10.0 | 13.0 | NaN | 355000.0 |
5 rows × 90 columns
df.tail()
| Unnamed:0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | ReleaseClause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18202 | 18202 | 238813 | J. Lundstram | 19 | https://cdn.sofifa.org/players/4/19/238813.png | England | https://cdn.sofifa.org/flags/14.png | 47 | 65 | Crewe Alexandra | ... | 40.0 | 48.0 | 47.0 | 10.0 | 13.0 | 7.0 | 8.0 | 9.0 | NaN | 1000.0 |
| 18203 | 18203 | 243165 | N. Christoffersson | 19 | https://cdn.sofifa.org/players/4/19/243165.png | Sweden | https://cdn.sofifa.org/flags/46.png | 47 | 63 | Trelleborgs FF | ... | 22.0 | 15.0 | 19.0 | 10.0 | 9.0 | 9.0 | 5.0 | 12.0 | NaN | 1000.0 |
| 18204 | 18204 | 241638 | B. Worman | 16 | https://cdn.sofifa.org/players/4/19/241638.png | England | https://cdn.sofifa.org/flags/14.png | 47 | 67 | Cambridge United | ... | 32.0 | 13.0 | 11.0 | 6.0 | 5.0 | 10.0 | 6.0 | 13.0 | NaN | 1000.0 |
| 18205 | 18205 | 246268 | D. Walker-Rice | 17 | https://cdn.sofifa.org/players/4/19/246268.png | England | https://cdn.sofifa.org/flags/14.png | 47 | 66 | Tranmere Rovers | ... | 20.0 | 25.0 | 27.0 | 14.0 | 6.0 | 14.0 | 8.0 | 9.0 | NaN | 1000.0 |
| 18206 | 18206 | 246269 | G. Nugent | 16 | https://cdn.sofifa.org/players/4/19/246269.png | England | https://cdn.sofifa.org/flags/14.png | 46 | 66 | Tranmere Rovers | ... | 40.0 | 43.0 | 50.0 | 10.0 | 15.0 | 9.0 | 12.0 | 9.0 | NaN | 1000.0 |
5 rows × 90 columns
df.info
<bound method DataFrame.info of Unnamed:0 ID Name Age \
0 0 158023 L. Messi 31
1 1 20801 Cristiano Ronaldo 33
2 2 190871 Neymar Jr 26
3 3 193080 De Gea 27
4 4 192985 K. De Bruyne 27
... ... ... ... ...
18202 18202 238813 J. Lundstram 19
18203 18203 243165 N. Christoffersson 19
18204 18204 241638 B. Worman 16
18205 18205 246268 D. Walker-Rice 17
18206 18206 246269 G. Nugent 16
Photo Nationality \
0 https://cdn.sofifa.org/players/4/19/158023.png Argentina
1 https://cdn.sofifa.org/players/4/19/20801.png Portugal
2 https://cdn.sofifa.org/players/4/19/190871.png Brazil
3 https://cdn.sofifa.org/players/4/19/193080.png Spain
4 https://cdn.sofifa.org/players/4/19/192985.png Belgium
... ... ...
18202 https://cdn.sofifa.org/players/4/19/238813.png England
18203 https://cdn.sofifa.org/players/4/19/243165.png Sweden
18204 https://cdn.sofifa.org/players/4/19/241638.png England
18205 https://cdn.sofifa.org/players/4/19/246268.png England
18206 https://cdn.sofifa.org/players/4/19/246269.png England
Flag Overall Potential \
0 https://cdn.sofifa.org/flags/52.png 94 94
1 https://cdn.sofifa.org/flags/38.png 94 94
2 https://cdn.sofifa.org/flags/54.png 92 93
3 https://cdn.sofifa.org/flags/45.png 91 93
4 https://cdn.sofifa.org/flags/7.png 91 92
... ... ... ...
18202 https://cdn.sofifa.org/flags/14.png 47 65
18203 https://cdn.sofifa.org/flags/46.png 47 63
18204 https://cdn.sofifa.org/flags/14.png 47 67
18205 https://cdn.sofifa.org/flags/14.png 47 66
18206 https://cdn.sofifa.org/flags/14.png 46 66
Club ... Marking StandingTackle SlidingTackle \
0 FC Barcelona ... 33.0 28.0 26.0
1 Juventus ... 28.0 31.0 23.0
2 Paris Saint-Germain ... 27.0 24.0 33.0
3 Manchester United ... 15.0 21.0 13.0
4 Manchester City ... 68.0 58.0 51.0
... ... ... ... ... ...
18202 Crewe Alexandra ... 40.0 48.0 47.0
18203 Trelleborgs FF ... 22.0 15.0 19.0
18204 Cambridge United ... 32.0 13.0 11.0
18205 Tranmere Rovers ... 20.0 25.0 27.0
18206 Tranmere Rovers ... 40.0 43.0 50.0
GKDiving GKHandling GKKicking GKPositioning GKReflexes \
0 6.0 11.0 15.0 14.0 8.0
1 7.0 11.0 15.0 14.0 11.0
2 9.0 9.0 15.0 15.0 11.0
3 90.0 85.0 87.0 88.0 94.0
4 15.0 13.0 5.0 10.0 13.0
... ... ... ... ... ...
18202 10.0 13.0 7.0 8.0 9.0
18203 10.0 9.0 9.0 5.0 12.0
18204 6.0 5.0 10.0 6.0 13.0
18205 14.0 6.0 14.0 8.0 9.0
18206 10.0 15.0 9.0 12.0 9.0
ReleaseClause Normalized_Wage
0 NaN 565000.0
1 NaN 405000.0
2 8.1 290000.0
3 NaN 260000.0
4 NaN 355000.0
... ... ...
18202 NaN 1000.0
18203 NaN 1000.0
18204 NaN 1000.0
18205 NaN 1000.0
18206 NaN 1000.0
[18207 rows x 90 columns]>
import warnings
import numpy as np
import pandas as pd
from pyod.models.mad import MAD
from pyod.models.knn import KNN
from pyod.models.lof import LOF
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
#Parametric methods: Univariate
df.hist(figsize=(6,6));
# enough variation between features to show outliers
df.describe()
| Unnamed:0 | ID | Age | Overall | Potential | Value | Wage | Special | InternationalReputation | WeakFoot | ... | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | ReleaseClause | Normalized_Wage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 0.0 | 0.0 | 18207.000000 | 18207.000000 | 18207.000000 | ... | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 34.000000 | 18207.000000 |
| mean | 9103.000000 | 214298.338606 | 25.122206 | 66.238699 | 71.307299 | NaN | NaN | 1597.809908 | 1.112924 | 2.947438 | ... | 47.156973 | 47.572088 | 45.541056 | 16.572417 | 16.348382 | 16.189268 | 16.345691 | 16.666831 | 0.752941 | 9731.312133 |
| std | 5256.052511 | 29965.244204 | 4.669943 | 6.908930 | 6.136496 | NaN | NaN | 272.586016 | 0.393554 | 0.659591 | ... | 20.025458 | 21.773243 | 21.389596 | 17.692536 | 16.905507 | 16.502101 | 17.032944 | 17.951898 | 1.329670 | 21999.290406 |
| min | 0.000000 | 16.000000 | 16.000000 | 46.000000 | 48.000000 | NaN | NaN | 731.000000 | 1.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.100000 | 0.000000 |
| 25% | 4551.500000 | 200315.500000 | 21.000000 | 62.000000 | 67.000000 | NaN | NaN | 1457.000000 | 1.000000 | 3.000000 | ... | 30.000000 | 26.000000 | 24.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 0.200000 | 1000.000000 |
| 50% | 9103.000000 | 221759.000000 | 25.000000 | 66.000000 | 71.000000 | NaN | NaN | 1635.000000 | 1.000000 | 3.000000 | ... | 53.000000 | 55.000000 | 52.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 0.700000 | 3000.000000 |
| 75% | 13654.500000 | 236529.500000 | 28.000000 | 71.000000 | 75.000000 | NaN | NaN | 1787.000000 | 1.000000 | 3.000000 | ... | 64.000000 | 66.000000 | 64.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 0.800000 | 9000.000000 |
| max | 18206.000000 | 246620.000000 | 45.000000 | 94.000000 | 95.000000 | NaN | NaN | 2346.000000 | 5.000000 | 5.000000 | ... | 94.000000 | 93.000000 | 91.000000 | 90.000000 | 92.000000 | 91.000000 | 90.000000 | 94.000000 | 8.100000 | 565000.000000 |
8 rows × 49 columns
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("/Users/prose/OneDrive/Desktop/Data/data.csv")
df.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 158023 | L. Messi | 31 | https://cdn.sofifa.org/players/4/19/158023.png | Argentina | https://cdn.sofifa.org/flags/52.png | 94 | 94 | FC Barcelona | ... | 96.0 | 33.0 | 28.0 | 26.0 | 6.0 | 11.0 | 15.0 | 14.0 | 8.0 | €226.5M |
| 1 | 1 | 20801 | Cristiano Ronaldo | 33 | https://cdn.sofifa.org/players/4/19/20801.png | Portugal | https://cdn.sofifa.org/flags/38.png | 94 | 94 | Juventus | ... | 95.0 | 28.0 | 31.0 | 23.0 | 7.0 | 11.0 | 15.0 | 14.0 | 11.0 | €127.1M |
| 2 | 2 | 190871 | Neymar Jr | 26 | https://cdn.sofifa.org/players/4/19/190871.png | Brazil | https://cdn.sofifa.org/flags/54.png | 92 | 93 | Paris Saint-Germain | ... | 94.0 | 27.0 | 24.0 | 33.0 | 9.0 | 9.0 | 15.0 | 15.0 | 11.0 | €228.1M |
| 3 | 3 | 193080 | De Gea | 27 | https://cdn.sofifa.org/players/4/19/193080.png | Spain | https://cdn.sofifa.org/flags/45.png | 91 | 93 | Manchester United | ... | 68.0 | 15.0 | 21.0 | 13.0 | 90.0 | 85.0 | 87.0 | 88.0 | 94.0 | €138.6M |
| 4 | 4 | 192985 | K. De Bruyne | 27 | https://cdn.sofifa.org/players/4/19/192985.png | Belgium | https://cdn.sofifa.org/flags/7.png | 91 | 92 | Manchester City | ... | 88.0 | 68.0 | 58.0 | 51.0 | 15.0 | 13.0 | 5.0 | 10.0 | 13.0 | €196.4M |
5 rows × 89 columns
# enough variation between features to show outliers
df.describe()
| Unnamed: 0 | ID | Age | Overall | Potential | Special | International Reputation | Weak Foot | Skill Moves | Jersey Number | ... | Penalties | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18207.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18147.000000 | ... | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 | 18159.000000 |
| mean | 9103.000000 | 214298.338606 | 25.122206 | 66.238699 | 71.307299 | 1597.809908 | 1.113222 | 2.947299 | 2.361308 | 19.546096 | ... | 48.548598 | 58.648274 | 47.281623 | 47.697836 | 45.661435 | 16.616223 | 16.391596 | 16.232061 | 16.388898 | 16.710887 |
| std | 5256.052511 | 29965.244204 | 4.669943 | 6.908930 | 6.136496 | 272.586016 | 0.394031 | 0.660456 | 0.756164 | 15.947765 | ... | 15.704053 | 11.436133 | 19.904397 | 21.664004 | 21.289135 | 17.695349 | 16.906900 | 16.502864 | 17.034669 | 17.955119 |
| min | 0.000000 | 16.000000 | 16.000000 | 46.000000 | 48.000000 | 731.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 5.000000 | 3.000000 | 3.000000 | 2.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 4551.500000 | 200315.500000 | 21.000000 | 62.000000 | 67.000000 | 1457.000000 | 1.000000 | 3.000000 | 2.000000 | 8.000000 | ... | 39.000000 | 51.000000 | 30.000000 | 27.000000 | 24.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 |
| 50% | 9103.000000 | 221759.000000 | 25.000000 | 66.000000 | 71.000000 | 1635.000000 | 1.000000 | 3.000000 | 2.000000 | 17.000000 | ... | 49.000000 | 60.000000 | 53.000000 | 55.000000 | 52.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 |
| 75% | 13654.500000 | 236529.500000 | 28.000000 | 71.000000 | 75.000000 | 1787.000000 | 1.000000 | 3.000000 | 3.000000 | 26.000000 | ... | 60.000000 | 67.000000 | 64.000000 | 66.000000 | 64.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 |
| max | 18206.000000 | 246620.000000 | 45.000000 | 94.000000 | 95.000000 | 2346.000000 | 5.000000 | 5.000000 | 5.000000 | 99.000000 | ... | 92.000000 | 96.000000 | 94.000000 | 93.000000 | 91.000000 | 90.000000 | 92.000000 | 91.000000 | 90.000000 | 94.000000 |
8 rows × 44 columns
#Parametric methods: Univariate
df.hist(figsize=(6,6));
def out_std(s, nstd=3.0, return_thresholds=False):
"""
Return a boolean mask of outliers for a series
using standard deviation, works column-wise.
param nstd:
Set number of standard deviations from the mean
to consider an outlier
:type nstd: ``float``
param return_thresholds:
True returns the lower and upper bounds, good for plotting.
False returns the masked array
:type return_thresholds: ``bool``
"""
data_mean, data_std = s.mean(), s.std()
cut_off = df_std * nstd
lower, upper = df_mean - cut_off, df_mean + cut_off
if return_thresholds:
return lower, upper
else:
return [True if x < lower or x > upper else False for x in s]
def out_iqr(s, k=1.5, return_thresholds=False):
"""
Return a boolean mask of outliers for a series
using interquartile range, works column-wise.
param k:
some cutoff to multiply by the iqr
:type k: ``float``
param return_thresholds:
True returns the lower and upper bounds, good for plotting.
False returns the masked array
:type return_thresholds: ``bool``
"""
# calculate interquartile range
q25, q75 = np.percentile(s, 25), np.percentile(s, 75)
iqr = q75 - q25
# calculate the outlier cutoff
cut_off = iqr * k
lower, upper = q25 - cut_off, q75 + cut_off
if return_thresholds:
return lower, upper
else: # identify outliers
return [True if x < lower or x > upper else False for x in s]
df['Height'].describe()
count 18159 unique 21 top 6'0 freq 2881 Name: Height, dtype: object
df['Weight'].describe()
count 18159 unique 57 top 165lbs freq 1483 Name: Weight, dtype: object
# Create a histogram using Seaborn
g = sns.histplot(data = df, x = 'Weight')
# Add labels
g.set_xlabel('Weight Distribution')
Text(0.5, 0, 'Weight Distribution')
# Create a histogram using Seaborn
g = sns.histplot(data = df, x = 'Height')
# Add labels
g.set_xlabel('Height Distribution')
Text(0.5, 0, 'Height Distribution')
df['Weight'] = df['Weight'].str.replace('lbs', '')
df['Value'] = df['Value'].str.replace('M', '')
df["Value"].head()
0 €110.5 1 €77 2 €118.5 3 €72 4 €102 Name: Value, dtype: object
df["Weight"].head()
0 159 1 183 2 150 3 168 4 154 Name: Weight, dtype: object
# Box Plot
import seaborn as sns
sns.boxplot(df['Weight'])
<Axes: >
import pandas as pd
# Use Regular Expression to convert Height from feet + inches into cm. Convert from text into integer
r = re.compile(r"([0-9]+)'([0-9]+)") # to set the pattern e.g. 5'7 , 4'12 etc
def get_cm(height):
height = str(height)
m = r.match(height)
if m == None:
return float('NaN')
else:
return float(m.group(1))*30.48 + float(m.group(2))*2.54
df["Height"] = df["Height"].apply(lambda x:get_cm(x))
# Box Plot
import seaborn as sns
sns.boxplot(df['Height'])
<Axes: >
# Import zscore function
from scipy.stats import zscore
# Calculate z-score for each data point and compute its absolute value
z_scores = zscore(df['Height'])
abs_z_scores = np.abs(z_scores)
# Select the outliers using a threshold of 3
outliers = df[abs_z_scores > 3]
outliers.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause |
|---|
0 rows × 89 columns
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation
# Obtain the MAD value
mad_score = median_abs_deviation(df['Height'])
mad_score
nan
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation
# Obtain the MAD value
mad_score = median_abs_deviation(df['Age'])
mad_score
4.0
# Calculate the percentiles
seventy_fifth = df['Age'].quantile(0.75)
twenty_fifth = df['Age'].quantile(0.25)
# Obtain IQR
iqr = seventy_fifth - twenty_fifth
# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)
# Subset the dataset
outliers = df[(df['Age'] < lower) | (df['Age'] > upper)]
outliers.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | 41 | 1179 | G. Buffon | 40 | https://cdn.sofifa.org/players/4/19/1179.png | Italy | https://cdn.sofifa.org/flags/27.png | 88 | 88 | Paris Saint-Germain | ... | 70.0 | 13.0 | 11.0 | 11.0 | 88.0 | 87.0 | 74.0 | 90.0 | 83.0 | €7.4M |
| 554 | 554 | 49031 | S. Sorrentino | 39 | https://cdn.sofifa.org/players/4/19/49031.png | Italy | https://cdn.sofifa.org/flags/27.png | 80 | 80 | Chievo Verona | ... | 66.0 | 25.0 | 10.0 | 13.0 | 81.0 | 82.0 | 66.0 | 82.0 | 79.0 | €1.7M |
| 864 | 864 | 153260 | Hilton | 40 | https://cdn.sofifa.org/players/4/19/153260.png | Brazil | https://cdn.sofifa.org/flags/54.png | 78 | 78 | Montpellier HSC | ... | 70.0 | 83.0 | 77.0 | 76.0 | 12.0 | 7.0 | 11.0 | 12.0 | 13.0 | NaN |
| 1120 | 1120 | 156092 | J. Villar | 41 | https://cdn.sofifa.org/players/4/19/156092.png | Paraguay | https://cdn.sofifa.org/flags/58.png | 77 | 77 | NaN | ... | 55.0 | 13.0 | 13.0 | 14.0 | 75.0 | 75.0 | 74.0 | 78.0 | 77.0 | NaN |
| 1294 | 1294 | 14907 | A. Bizzarri | 40 | https://cdn.sofifa.org/players/4/19/14907.png | Argentina | https://cdn.sofifa.org/flags/52.png | 76 | 76 | Foggia | ... | 60.0 | 11.0 | 12.0 | 11.0 | 76.0 | 74.0 | 66.0 | 82.0 | 76.0 | €840K |
5 rows × 89 columns
df['Weight'].head()
0 159 1 183 2 150 3 168 4 154 Name: Weight, dtype: object
df['Age'].head()
0 31 1 33 2 26 3 27 4 27 Name: Age, dtype: int64
# Convert the 'Weight' column to int64
df['Weight'] = pd.to_numeric(df['Weight'], errors='coerce').astype('Int64')
# Display the DataFrame with 'Weight' column updated to int64
print(df)
Unnamed: 0 ID Name Age \
0 0 158023 L. Messi 31
1 1 20801 Cristiano Ronaldo 33
2 2 190871 Neymar Jr 26
3 3 193080 De Gea 27
4 4 192985 K. De Bruyne 27
... ... ... ... ...
18202 18202 238813 J. Lundstram 19
18203 18203 243165 N. Christoffersson 19
18204 18204 241638 B. Worman 16
18205 18205 246268 D. Walker-Rice 17
18206 18206 246269 G. Nugent 16
Photo Nationality \
0 https://cdn.sofifa.org/players/4/19/158023.png Argentina
1 https://cdn.sofifa.org/players/4/19/20801.png Portugal
2 https://cdn.sofifa.org/players/4/19/190871.png Brazil
3 https://cdn.sofifa.org/players/4/19/193080.png Spain
4 https://cdn.sofifa.org/players/4/19/192985.png Belgium
... ... ...
18202 https://cdn.sofifa.org/players/4/19/238813.png England
18203 https://cdn.sofifa.org/players/4/19/243165.png Sweden
18204 https://cdn.sofifa.org/players/4/19/241638.png England
18205 https://cdn.sofifa.org/players/4/19/246268.png England
18206 https://cdn.sofifa.org/players/4/19/246269.png England
Flag Overall Potential \
0 https://cdn.sofifa.org/flags/52.png 94 94
1 https://cdn.sofifa.org/flags/38.png 94 94
2 https://cdn.sofifa.org/flags/54.png 92 93
3 https://cdn.sofifa.org/flags/45.png 91 93
4 https://cdn.sofifa.org/flags/7.png 91 92
... ... ... ...
18202 https://cdn.sofifa.org/flags/14.png 47 65
18203 https://cdn.sofifa.org/flags/46.png 47 63
18204 https://cdn.sofifa.org/flags/14.png 47 67
18205 https://cdn.sofifa.org/flags/14.png 47 66
18206 https://cdn.sofifa.org/flags/14.png 46 66
Club ... Composure Marking StandingTackle \
0 FC Barcelona ... 96.0 33.0 28.0
1 Juventus ... 95.0 28.0 31.0
2 Paris Saint-Germain ... 94.0 27.0 24.0
3 Manchester United ... 68.0 15.0 21.0
4 Manchester City ... 88.0 68.0 58.0
... ... ... ... ... ...
18202 Crewe Alexandra ... 45.0 40.0 48.0
18203 Trelleborgs FF ... 42.0 22.0 15.0
18204 Cambridge United ... 41.0 32.0 13.0
18205 Tranmere Rovers ... 46.0 20.0 25.0
18206 Tranmere Rovers ... 43.0 40.0 43.0
SlidingTackle GKDiving GKHandling GKKicking GKPositioning \
0 26.0 6.0 11.0 15.0 14.0
1 23.0 7.0 11.0 15.0 14.0
2 33.0 9.0 9.0 15.0 15.0
3 13.0 90.0 85.0 87.0 88.0
4 51.0 15.0 13.0 5.0 10.0
... ... ... ... ... ...
18202 47.0 10.0 13.0 7.0 8.0
18203 19.0 10.0 9.0 9.0 5.0
18204 11.0 6.0 5.0 10.0 6.0
18205 27.0 14.0 6.0 14.0 8.0
18206 50.0 10.0 15.0 9.0 12.0
GKReflexes Release Clause
0 8.0 €226.5M
1 11.0 €127.1M
2 11.0 €228.1M
3 94.0 €138.6M
4 13.0 €196.4M
... ... ...
18202 9.0 €143K
18203 12.0 €113K
18204 13.0 €165K
18205 9.0 €143K
18206 9.0 €165K
[18207 rows x 89 columns]
# Calculate the percentiles
seventy_fifth = df['Weight'].quantile(0.75)
twenty_fifth = df['Weight'].quantile(0.25)
# Obtain IQR
iqr = seventy_fifth - twenty_fifth
# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)
# Subset the dataset
outliers = df[(df['Weight'] < lower) | (df['Weight'] > upper)]
outliers.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 19 | 192119 | T. Courtois | 26 | https://cdn.sofifa.org/players/4/19/192119.png | Belgium | https://cdn.sofifa.org/flags/7.png | 89 | 90 | Real Madrid | ... | 66.0 | 20.0 | 18.0 | 16.0 | 85.0 | 91.0 | 72.0 | 86.0 | 88.0 | €113.7M |
| 115 | 115 | 212190 | N. Süle | 22 | https://cdn.sofifa.org/players/4/19/212190.png | Germany | https://cdn.sofifa.org/flags/21.png | 84 | 90 | FC Bayern München | ... | 72.0 | 82.0 | 85.0 | 83.0 | 15.0 | 7.0 | 14.0 | 7.0 | 15.0 | €67.5M |
| 165 | 165 | 213331 | J. Tah | 22 | https://cdn.sofifa.org/players/4/19/213331.png | Germany | https://cdn.sofifa.org/flags/21.png | 83 | 88 | Bayer 04 Leverkusen | ... | 75.0 | 80.0 | 88.0 | 84.0 | 11.0 | 8.0 | 7.0 | 9.0 | 14.0 | €52.4M |
| 210 | 210 | 179783 | R. Fährmann | 29 | https://cdn.sofifa.org/players/4/19/179783.png | Germany | https://cdn.sofifa.org/flags/21.png | 83 | 84 | FC Schalke 04 | ... | 61.0 | 10.0 | 12.0 | 10.0 | 83.0 | 81.0 | 52.0 | 82.0 | 87.0 | €35.5M |
| 259 | 259 | 203263 | H. Maguire | 25 | https://cdn.sofifa.org/players/4/19/203263.png | England | https://cdn.sofifa.org/flags/14.png | 82 | 85 | Leicester City | ... | 79.0 | 81.0 | 84.0 | 81.0 | 14.0 | 16.0 | 9.0 | 14.0 | 9.0 | €46.4M |
5 rows × 89 columns
# Remove rows with NA values in the 'Weight' column
df = df.dropna(subset=['Weight'])
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation
# Obtain the MAD value
mad_score = median_abs_deviation(df['Weight'])
mad_score
11.0
# Box Plot
import seaborn as sns
sns.boxplot(df['Weight'])
<Axes: >
# Create a box plot
g = sns.boxplot(data = df, x = 'Weight')
# Add a title and change xlabel
g.set_title('Box Plot of Weight')
g.set_xlabel('Weight')
Text(0.5, 0, 'Weight')
df['Height'].head()
0 170.18 1 187.96 2 175.26 3 193.04 4 180.34 Name: Height, dtype: float64
# Calculate the percentiles
seventy_fifth = df['Height'].quantile(0.75)
twenty_fifth = df['Height'].quantile(0.25)
# Obtain IQR
iqr = seventy_fifth - twenty_fifth
# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)
# Subset the dataset
outliers = df[(df['Height'] < lower) | (df['Height'] > upper)]
outliers.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 463 | 463 | 203841 | N. Pope | 26 | https://cdn.sofifa.org/players/4/19/203841.png | England | https://cdn.sofifa.org/flags/14.png | 80 | 83 | Burnley | ... | 62.0 | 14.0 | 15.0 | 14.0 | 79.0 | 80.0 | 74.0 | 80.0 | 80.0 | €25.7M |
| 1204 | 1204 | 220932 | L. Kalinić | 28 | https://cdn.sofifa.org/players/4/19/220932.png | Croatia | https://cdn.sofifa.org/flags/10.png | 76 | 78 | KAA Gent | ... | 58.0 | 13.0 | 17.0 | 17.0 | 81.0 | 75.0 | 42.0 | 76.0 | 80.0 | €10.2M |
| 1340 | 1340 | 183895 | M. Moralez | 31 | https://cdn.sofifa.org/players/4/19/183895.png | Argentina | https://cdn.sofifa.org/flags/52.png | 76 | 76 | New York City FC | ... | 76.0 | 48.0 | 58.0 | 51.0 | 6.0 | 5.0 | 14.0 | 14.0 | 9.0 | €10.5M |
| 1454 | 1454 | 172203 | F. Forster | 30 | https://cdn.sofifa.org/players/4/19/172203.png | England | https://cdn.sofifa.org/flags/14.png | 76 | 77 | Southampton | ... | 56.0 | 11.0 | 13.0 | 12.0 | 72.0 | 76.0 | 64.0 | 77.0 | 78.0 | €10.9M |
| 2493 | 2493 | 202184 | J. Plata | 26 | https://cdn.sofifa.org/players/4/19/202184.png | Ecuador | https://cdn.sofifa.org/flags/57.png | 74 | 74 | Real Salt Lake | ... | 70.0 | 34.0 | 29.0 | 27.0 | 11.0 | 15.0 | 9.0 | 13.0 | 13.0 | €9M |
5 rows × 89 columns
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation
# Obtain the MAD value
mad_score = median_abs_deviation(df['Height'])
mad_score
5.079999999999984
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation
# Obtain the MAD value
mad_score = median_abs_deviation(df['Weight'])
mad_score
11.0
# Create a box plot
g = sns.boxplot(data = df, x = 'Height')
# Add a title and change xlabel
g.set_title('Box Plot of Height')
g.set_xlabel('Height')
Text(0.5, 0, 'Height')
# Import zscore function
from scipy.stats import zscore
# Calculate z-score for each data point and compute its absolute value
z_scores = zscore(df['Height'])
abs_z_scores = np.abs(z_scores)
# Select the outliers using a threshold of 3
outliers = df[abs_z_scores > 3]
outliers.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1132 | 1132 | 221634 | L. Acosta | 24 | https://cdn.sofifa.org/players/4/19/221634.png | Argentina | https://cdn.sofifa.org/flags/52.png | 77 | 79 | DC United | ... | 74.0 | 45.0 | 47.0 | 31.0 | 13.0 | 9.0 | 6.0 | 12.0 | 10.0 | €17.3M |
| 1165 | 1165 | 170719 | D. Buonanotte | 30 | https://cdn.sofifa.org/players/4/19/170719.png | Argentina | https://cdn.sofifa.org/flags/52.png | 77 | 77 | Universidad Católica | ... | 71.0 | 35.0 | 31.0 | 18.0 | 15.0 | 16.0 | 16.0 | 9.0 | 13.0 | €11.9M |
| 1340 | 1340 | 183895 | M. Moralez | 31 | https://cdn.sofifa.org/players/4/19/183895.png | Argentina | https://cdn.sofifa.org/flags/52.png | 76 | 76 | New York City FC | ... | 76.0 | 48.0 | 58.0 | 51.0 | 6.0 | 5.0 | 14.0 | 14.0 | 9.0 | €10.5M |
| 1680 | 1680 | 214327 | V. Hernández | 29 | https://cdn.sofifa.org/players/4/19/214327.png | Colombia | https://cdn.sofifa.org/flags/56.png | 75 | 75 | Atlético Nacional | ... | 77.0 | 62.0 | 20.0 | 21.0 | 10.0 | 15.0 | 15.0 | 13.0 | 11.0 | €9.1M |
| 2493 | 2493 | 202184 | J. Plata | 26 | https://cdn.sofifa.org/players/4/19/202184.png | Ecuador | https://cdn.sofifa.org/flags/57.png | 74 | 74 | Real Salt Lake | ... | 70.0 | 34.0 | 29.0 | 27.0 | 11.0 | 15.0 | 9.0 | 13.0 | 13.0 | €9M |
5 rows × 89 columns
# Obtain number of outliers
print(f'Number of outliers: {len(outliers)}')
Number of outliers: 38
# Import zscore function
from scipy.stats import zscore
# Calculate z-score for each data point and compute its absolute value
z_scores = zscore(df['Age'])
abs_z_scores = np.abs(z_scores)
# Select the outliers using a threshold of 3
outliers = df[abs_z_scores > 3]
outliers.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | 41 | 1179 | G. Buffon | 40 | https://cdn.sofifa.org/players/4/19/1179.png | Italy | https://cdn.sofifa.org/flags/27.png | 88 | 88 | Paris Saint-Germain | ... | 70.0 | 13.0 | 11.0 | 11.0 | 88.0 | 87.0 | 74.0 | 90.0 | 83.0 | €7.4M |
| 864 | 864 | 153260 | Hilton | 40 | https://cdn.sofifa.org/players/4/19/153260.png | Brazil | https://cdn.sofifa.org/flags/54.png | 78 | 78 | Montpellier HSC | ... | 70.0 | 83.0 | 77.0 | 76.0 | 12.0 | 7.0 | 11.0 | 12.0 | 13.0 | NaN |
| 1120 | 1120 | 156092 | J. Villar | 41 | https://cdn.sofifa.org/players/4/19/156092.png | Paraguay | https://cdn.sofifa.org/flags/58.png | 77 | 77 | NaN | ... | 55.0 | 13.0 | 13.0 | 14.0 | 75.0 | 75.0 | 74.0 | 78.0 | 77.0 | NaN |
| 1294 | 1294 | 14907 | A. Bizzarri | 40 | https://cdn.sofifa.org/players/4/19/14907.png | Argentina | https://cdn.sofifa.org/flags/52.png | 76 | 76 | Foggia | ... | 60.0 | 11.0 | 12.0 | 11.0 | 76.0 | 74.0 | 66.0 | 82.0 | 76.0 | €840K |
| 2821 | 2821 | 232543 | S. Bertoli | 40 | https://cdn.sofifa.org/players/4/19/232543.png | Argentina | https://cdn.sofifa.org/flags/52.png | 73 | 73 | Patronato | ... | 44.0 | 12.0 | 13.0 | 11.0 | 76.0 | 73.0 | 78.0 | 67.0 | 71.0 | €392K |
5 rows × 89 columns
# Obtain number of outliers
print(f'Number of outliers: {len(outliers)}')
Number of outliers: 22
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation
# Obtain the MAD value
mad_score = median_abs_deviation(df['Height'])
mad_score
5.079999999999984
# Calculate the percentiles
seventy_fifth = df['Height'].quantile(0.75)
twenty_fifth = df['Height'].quantile(0.25)
# Obtain IQR
iqr = seventy_fifth - twenty_fifth
# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)
# Subset the dataset
outliers = df[(df['Height'] < lower) | (df['Height'] > upper)]
outliers.head()
| Unnamed: 0 | ID | Name | Age | Photo | Nationality | Flag | Overall | Potential | Club | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 463 | 463 | 203841 | N. Pope | 26 | https://cdn.sofifa.org/players/4/19/203841.png | England | https://cdn.sofifa.org/flags/14.png | 80 | 83 | Burnley | ... | 62.0 | 14.0 | 15.0 | 14.0 | 79.0 | 80.0 | 74.0 | 80.0 | 80.0 | €25.7M |
| 1204 | 1204 | 220932 | L. Kalinić | 28 | https://cdn.sofifa.org/players/4/19/220932.png | Croatia | https://cdn.sofifa.org/flags/10.png | 76 | 78 | KAA Gent | ... | 58.0 | 13.0 | 17.0 | 17.0 | 81.0 | 75.0 | 42.0 | 76.0 | 80.0 | €10.2M |
| 1340 | 1340 | 183895 | M. Moralez | 31 | https://cdn.sofifa.org/players/4/19/183895.png | Argentina | https://cdn.sofifa.org/flags/52.png | 76 | 76 | New York City FC | ... | 76.0 | 48.0 | 58.0 | 51.0 | 6.0 | 5.0 | 14.0 | 14.0 | 9.0 | €10.5M |
| 1454 | 1454 | 172203 | F. Forster | 30 | https://cdn.sofifa.org/players/4/19/172203.png | England | https://cdn.sofifa.org/flags/14.png | 76 | 77 | Southampton | ... | 56.0 | 11.0 | 13.0 | 12.0 | 72.0 | 76.0 | 64.0 | 77.0 | 78.0 | €10.9M |
| 2493 | 2493 | 202184 | J. Plata | 26 | https://cdn.sofifa.org/players/4/19/202184.png | Ecuador | https://cdn.sofifa.org/flags/57.png | 74 | 74 | Real Salt Lake | ... | 70.0 | 34.0 | 29.0 | 27.0 | 11.0 | 15.0 | 9.0 | 13.0 | 13.0 | €9M |
5 rows × 89 columns
# Obtain number of outliers
print(f'Number of outliers: {len(outliers)}')
Number of outliers: 41
# Calculate the median
median_value = df['Height'].median()
# Impute outliers with the median
df_imputed = df.copy()
df_imputed.loc[outliers.index, 'Height'] = median_value
from scipy.stats.mstats import winsorize
df_winsorized = df.copy()
df_winsorized['Total'] = winsorize(df_winsorized['Height'],\
limits = [0.05, 0.05], inplace = True)
# Create a box plot
g = sns.boxplot(data = df, x = 'Height')
# Add a title and change xlabel
g.set_title('Box Plot of Height')
g.set_xlabel('Height')
Text(0.5, 0, 'Height')
# Scatter plot
fig, ax = plt.subplots(figsize = (6,4))
ax.scatter(df['Height'],df['Weight'])
# x-axis label
ax.set_xlabel('(Height)')
# y-axis label
ax.set_ylabel('(Weight )')
plt.show()
# Z score
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(df['Height']))
print(z)
0 1.646010
1 0.995907
2 0.891177
3 1.750741
4 0.136343
...
18202 0.891177
18203 1.373324
18204 1.268593
18205 0.513760
18206 0.513760
Name: Height, Length: 18159, dtype: float64
threshold = 2
# Position of the outlier
print(np.where(z > 2))
(array([ 14, 19, 29, 63, 92, 102, 109, 115, 151,
165, 204, 210, 225, 229, 235, 261, 274, 279,
287, 293, 303, 310, 317, 372, 380, 389, 457,
463, 466, 467, 544, 545, 582, 634, 639, 704,
718, 731, 733, 740, 748, 762, 767, 802, 874,
882, 979, 1010, 1084, 1127, 1132, 1165, 1175, 1193,
1204, 1213, 1225, 1226, 1249, 1278, 1340, 1364, 1379,
1411, 1412, 1416, 1425, 1426, 1428, 1429, 1431, 1453,
1454, 1461, 1475, 1493, 1498, 1550, 1558, 1564, 1581,
1599, 1623, 1631, 1641, 1680, 1684, 1719, 1744, 1780,
1795, 1804, 1817, 1819, 1840, 1857, 1859, 1902, 1911,
1984, 2063, 2075, 2077, 2091, 2094, 2104, 2129, 2177,
2187, 2203, 2216, 2231, 2262, 2289, 2301, 2329, 2335,
2344, 2355, 2356, 2357, 2433, 2436, 2493, 2521, 2537,
2545, 2559, 2612, 2627, 2632, 2664, 2672, 2723, 2726,
2732, 2736, 2750, 2766, 2768, 2770, 2781, 2792, 2815,
2818, 2828, 2834, 2859, 2895, 2957, 2977, 2984, 3060,
3121, 3137, 3146, 3221, 3259, 3326, 3346, 3374, 3379,
3390, 3394, 3409, 3448, 3466, 3475, 3480, 3506, 3513,
3533, 3559, 3579, 3588, 3614, 3651, 3657, 3712, 3724,
3764, 3769, 3801, 3837, 3878, 3886, 3890, 3910, 3979,
3981, 4017, 4062, 4161, 4174, 4177, 4189, 4204, 4236,
4247, 4281, 4336, 4338, 4378, 4379, 4390, 4408, 4414,
4418, 4424, 4428, 4453, 4454, 4460, 4473, 4515, 4526,
4529, 4542, 4606, 4653, 4712, 4726, 4777, 4782, 4800,
4860, 4868, 4874, 4945, 4963, 4974, 5013, 5049, 5053,
5072, 5086, 5103, 5109, 5135, 5145, 5182, 5191, 5193,
5205, 5211, 5216, 5244, 5285, 5307, 5345, 5350, 5402,
5422, 5428, 5431, 5441, 5443, 5469, 5479, 5488, 5491,
5521, 5539, 5544, 5553, 5566, 5569, 5575, 5578, 5579,
5620, 5622, 5629, 5643, 5669, 5680, 5693, 5696, 5736,
5785, 5790, 5822, 5828, 5833, 5843, 5851, 5852, 5874,
5884, 5921, 5935, 5958, 5963, 5983, 5986, 5987, 6008,
6028, 6038, 6083, 6101, 6110, 6115, 6121, 6124, 6143,
6145, 6152, 6166, 6173, 6195, 6238, 6256, 6263, 6275,
6284, 6296, 6304, 6340, 6350, 6353, 6362, 6375, 6376,
6394, 6424, 6430, 6441, 6455, 6475, 6488, 6535, 6569,
6574, 6584, 6627, 6628, 6631, 6667, 6673, 6694, 6695,
6699, 6704, 6713, 6732, 6761, 6782, 6788, 6813, 6814,
6823, 6880, 6893, 6895, 6915, 6982, 6985, 6994, 7009,
7012, 7025, 7117, 7126, 7141, 7156, 7174, 7181, 7279,
7348, 7353, 7356, 7409, 7410, 7413, 7431, 7484, 7509,
7545, 7558, 7578, 7594, 7613, 7681, 7713, 7762, 7763,
7785, 7788, 7806, 7809, 7829, 7865, 7909, 7956, 7962,
7968, 7998, 8005, 8040, 8059, 8117, 8119, 8148, 8151,
8166, 8168, 8191, 8196, 8200, 8217, 8223, 8269, 8281,
8321, 8334, 8348, 8389, 8392, 8393, 8410, 8419, 8426,
8451, 8480, 8495, 8496, 8504, 8533, 8534, 8536, 8646,
8667, 8674, 8690, 8706, 8727, 8735, 8740, 8766, 8772,
8781, 8813, 8820, 8838, 8845, 8859, 8867, 8874, 8908,
8930, 8935, 8943, 8946, 8960, 8991, 9009, 9047, 9060,
9126, 9159, 9233, 9258, 9287, 9350, 9356, 9363, 9375,
9384, 9427, 9441, 9458, 9485, 9491, 9510, 9521, 9559,
9570, 9583, 9588, 9639, 9648, 9666, 9710, 9740, 9757,
9759, 9768, 9769, 9795, 9809, 9823, 9864, 9907, 9993,
9997, 10000, 10046, 10075, 10080, 10113, 10157, 10180, 10214,
10227, 10239, 10249, 10251, 10282, 10348, 10376, 10387, 10426,
10440, 10465, 10537, 10558, 10637, 10653, 10666, 10699, 10705,
10747, 10835, 10865, 10875, 10896, 10964, 10974, 11001, 11042,
11071, 11077, 11096, 11112, 11126, 11127, 11154, 11158, 11199,
11226, 11277, 11318, 11320, 11340, 11345, 11375, 11376, 11403,
11418, 11432, 11436, 11487, 11495, 11503, 11517, 11537, 11598,
11614, 11620, 11625, 11642, 11655, 11767, 11783, 11796, 11820,
11866, 11903, 11923, 11934, 11935, 11973, 12049, 12068, 12075,
12124, 12158, 12163, 12169, 12186, 12257, 12262, 12266, 12308,
12318, 12437, 12458, 12460, 12468, 12470, 12480, 12494, 12531,
12537, 12564, 12581, 12587, 12590, 12630, 12631, 12632, 12636,
12666, 12668, 12751, 12781, 12817, 12832, 12853, 12854, 12941,
12951, 12959, 12967, 12975, 12987, 13063, 13080, 13087, 13109,
13112, 13178, 13180, 13233, 13237, 13290, 13303, 13308, 13322,
13328, 13362, 13383, 13407, 13436, 13449, 13472, 13477, 13488,
13489, 13492, 13493, 13502, 13549, 13556, 13559, 13695, 13738,
13745, 13747, 13754, 13762, 13806, 13890, 13905, 13917, 13942,
13950, 14036, 14038, 14053, 14059, 14062, 14063, 14069, 14071,
14085, 14086, 14113, 14121, 14122, 14143, 14178, 14231, 14245,
14253, 14256, 14269, 14270, 14296, 14299, 14326, 14335, 14383,
14388, 14389, 14411, 14414, 14421, 14432, 14463, 14494, 14496,
14500, 14506, 14518, 14538, 14567, 14568, 14584, 14591, 14602,
14646, 14647, 14663, 14680, 14727, 14734, 14789, 14791, 14822,
14846, 14868, 14922, 14932, 14944, 15026, 15050, 15068, 15104,
15115, 15133, 15175, 15177, 15181, 15192, 15195, 15221, 15228,
15231, 15246, 15250, 15271, 15278, 15295, 15298, 15301, 15305,
15306, 15326, 15331, 15335, 15337, 15339, 15365, 15384, 15396,
15413, 15483, 15485, 15496, 15506, 15507, 15537, 15554, 15572,
15576, 15588, 15603, 15616, 15637, 15639, 15671, 15716, 15726,
15749, 15780, 15827, 15830, 15843, 15850, 15855, 15892, 15930,
15931, 15942, 15989, 16018, 16042, 16044, 16059, 16081, 16083,
16129, 16161, 16218, 16219, 16236, 16255, 16261, 16291, 16372,
16401, 16407, 16413, 16428, 16435, 16455, 16483, 16493, 16498,
16521, 16526, 16531, 16536, 16537, 16546, 16560, 16563, 16582,
16646, 16647, 16704, 16721, 16752, 16758, 16763, 16790, 16796,
16801, 16820, 16829, 16854, 16856, 16875, 16877, 16899, 16917,
16930, 16954, 16990, 17008, 17042, 17051, 17052, 17068, 17069,
17071, 17084, 17103, 17109, 17127, 17156, 17174, 17180, 17187,
17221, 17227, 17239, 17296, 17299, 17331, 17334, 17335, 17340,
17369, 17395, 17398, 17419, 17423, 17449, 17453, 17462, 17484,
17521, 17542, 17546, 17555, 17571, 17595, 17609, 17646, 17698,
17725, 17738, 17770, 17801, 17813, 17872, 17873, 17879, 17880,
17906, 18002, 18022, 18046, 18110, 18111, 18151], dtype=int64),)
# IQR
Q1 = np.percentile(df['Height'], 25, method='midpoint')
Q3 = np.percentile(df['Height'], 75, method='midpoint')
IQR = Q3 - Q1
print(IQR)
10.159999999999997
# Above Upper bound
upper=Q3+1.5*IQR
upper_array=np.array(df['Height']>=upper)
print("Upper Bound:",upper)
print(upper_array.sum())
#Below Lower bound
lower=Q1-1.5*IQR
lower_array=np.array(df['Height']<=lower)
print("Lower Bound:",lower)
print(lower_array.sum())
Upper Bound: 200.65999999999997 33 Lower Bound: 160.01999999999998 8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
df['Height'].hist(figsize=(15,10));
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
sns.heatmap(df.corr(), annot=True)
<Axes: >
ax = sns.lmplot(x = "Height",
y = "Weight",
data = df, hue = "Age", fit_reg = False, height = 5, aspect = 2.2)
sns.regplot(x = "Height",
y = "Weight",
data = df, scatter=False, ax=ax.axes[0, 0], order = 3)
plt.ylabel("Weight")
plt.xticks(list(range(1,30)), list(df['Height'].unique()))
plt.title("Relationship Between Height and Weight", fontsize=18)
plt.xlabel("Height", fontsize=14)
plt.ylabel("Weight", fontsize=14)
plt.show()
--------------------------------------------------------------------------- UFuncTypeError Traceback (most recent call last) Cell In[325], line 4 1 ax = sns.lmplot(x = "Height", 2 y = "Weight", 3 data = df, hue = "Age", fit_reg = False, height = 5, aspect = 2.2) ----> 4 sns.regplot(x = "Height", 5 y = "Weight", 6 data = df, scatter=False, ax=ax.axes[0, 0], order = 3) 7 plt.ylabel("Weight") 8 plt.xticks(list(range(1,30)), list(df['Height'].unique())) File ~\anaconda3\Lib\site-packages\seaborn\regression.py:759, in regplot(data, x, y, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax) 757 scatter_kws["marker"] = marker 758 line_kws = {} if line_kws is None else copy.copy(line_kws) --> 759 plotter.plot(ax, scatter_kws, line_kws) 760 return ax File ~\anaconda3\Lib\site-packages\seaborn\regression.py:368, in _RegressionPlotter.plot(self, ax, scatter_kws, line_kws) 365 self.scatterplot(ax, scatter_kws) 367 if self.fit_reg: --> 368 self.lineplot(ax, line_kws) 370 # Label the axes 371 if hasattr(self.x, "name"): File ~\anaconda3\Lib\site-packages\seaborn\regression.py:413, in _RegressionPlotter.lineplot(self, ax, kws) 411 """Draw the model.""" 412 # Fit the regression model --> 413 grid, yhat, err_bands = self.fit_regression(ax) 414 edges = grid[0], grid[-1] 416 # Get set default aesthetics File ~\anaconda3\Lib\site-packages\seaborn\regression.py:204, in _RegressionPlotter.fit_regression(self, ax, x_range, grid) 202 # Fit the regression 203 if self.order > 1: --> 204 yhat, yhat_boots = self.fit_poly(grid, self.order) 205 elif self.logistic: 206 from statsmodels.genmod.generalized_linear_model import GLM File ~\anaconda3\Lib\site-packages\seaborn\regression.py:254, in _RegressionPlotter.fit_poly(self, grid, order) 251 return np.polyval(np.polyfit(_x, _y, order), grid) 253 x, y = self.x, self.y --> 254 yhat = reg_func(x, y) 255 if self.ci is None: 256 return yhat, None File ~\anaconda3\Lib\site-packages\seaborn\regression.py:251, in _RegressionPlotter.fit_poly.<locals>.reg_func(_x, _y) 250 def reg_func(_x, _y): --> 251 return np.polyval(np.polyfit(_x, _y, order), grid) File <__array_function__ internals>:200, in polyfit(*args, **kwargs) File ~\anaconda3\Lib\site-packages\numpy\lib\polynomial.py:668, in polyfit(x, y, deg, rcond, full, w, cov) 666 scale = NX.sqrt((lhs*lhs).sum(axis=0)) 667 lhs /= scale --> 668 c, resids, rank, s = lstsq(lhs, rhs, rcond) 669 c = (c.T/scale).T # broadcast scale coefficients 671 # warn on rank reduction, which indicates an ill conditioned matrix File <__array_function__ internals>:200, in lstsq(*args, **kwargs) File ~\anaconda3\Lib\site-packages\numpy\linalg\linalg.py:2285, in lstsq(a, b, rcond) 2282 if n_rhs == 0: 2283 # lapack can't handle n_rhs = 0 - so allocate the array one larger in that axis 2284 b = zeros(b.shape[:-2] + (m, n_rhs + 1), dtype=b.dtype) -> 2285 x, resids, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj) 2286 if m == 0: 2287 x[...] = 0 UFuncTypeError: Cannot cast ufunc 'lstsq_n' input 1 from dtype('O') to dtype('float64') with casting rule 'same_kind'
# Find the pearson correlations matrix
corr = df.corr(method = 'pearson')
corr
| Unnamed: 0 | ID | Age | Overall | Potential | Special | International Reputation | Weak Foot | Skill Moves | Jersey Number | ... | Penalties | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 1.000000 | 0.416108 | -0.455707 | -0.972796 | -0.633444 | -0.596913 | -0.413322 | -0.204026 | -0.415672 | 0.213645 | ... | -0.337899 | -0.716173 | -0.280829 | -0.248564 | -0.220034 | 0.027757 | 0.027026 | 0.030654 | 0.019842 | 0.025610 |
| ID | 0.416108 | 1.000000 | -0.739162 | -0.417354 | 0.046577 | -0.231366 | -0.356191 | -0.075784 | -0.056914 | 0.182074 | ... | -0.140657 | -0.384473 | -0.110198 | -0.085929 | -0.068409 | -0.105594 | -0.111149 | -0.106652 | -0.118250 | -0.105778 |
| Age | -0.455707 | -0.739162 | 1.000000 | 0.453069 | -0.252281 | 0.236875 | 0.253765 | 0.059867 | 0.027649 | -0.241156 | ... | 0.139535 | 0.391023 | 0.142817 | 0.119745 | 0.103089 | 0.101277 | 0.106419 | 0.104964 | 0.116402 | 0.103313 |
| Overall | -0.972796 | -0.417354 | 0.453069 | 1.000000 | 0.661180 | 0.607236 | 0.499491 | 0.212015 | 0.414463 | -0.218931 | ... | 0.341429 | 0.727655 | 0.286505 | 0.252629 | 0.222811 | -0.025937 | -0.025062 | -0.029372 | -0.017674 | -0.023276 |
| Potential | -0.633444 | 0.046577 | -0.252281 | 0.661180 | 1.000000 | 0.384598 | 0.372993 | 0.162346 | 0.354290 | -0.010474 | ... | 0.224281 | 0.440008 | 0.162801 | 0.143564 | 0.128980 | -0.053446 | -0.054672 | -0.059061 | -0.052589 | -0.053341 |
| Special | -0.596913 | -0.231366 | 0.236875 | 0.607236 | 0.384598 | 1.000000 | 0.292208 | 0.341855 | 0.763412 | -0.133716 | ... | 0.734533 | 0.752331 | 0.561866 | 0.538802 | 0.506968 | -0.674637 | -0.673625 | -0.670254 | -0.668272 | -0.673238 |
| International Reputation | -0.413322 | -0.356191 | 0.253765 | 0.499491 | 0.372993 | 0.292208 | 1.000000 | 0.128317 | 0.208153 | -0.077298 | ... | 0.218620 | 0.392787 | 0.115208 | 0.092846 | 0.079176 | 0.004526 | 0.003942 | 0.000651 | 0.006904 | 0.003444 |
| Weak Foot | -0.204026 | -0.075784 | 0.059867 | 0.212015 | 0.162346 | 0.341855 | 0.128317 | 1.000000 | 0.340721 | -0.035410 | ... | 0.330252 | 0.278132 | 0.065673 | 0.042646 | 0.026105 | -0.231905 | -0.233098 | -0.229395 | -0.231298 | -0.232574 |
| Skill Moves | -0.415672 | -0.056914 | 0.027649 | 0.414463 | 0.354290 | 0.763412 | 0.208153 | 0.340721 | 1.000000 | -0.035194 | ... | 0.690434 | 0.586836 | 0.241428 | 0.210517 | 0.178607 | -0.621675 | -0.619755 | -0.616990 | -0.618853 | -0.621925 |
| Jersey Number | 0.213645 | 0.182074 | -0.241156 | -0.218931 | -0.010474 | -0.133716 | -0.077298 | -0.035410 | -0.035194 | 1.000000 | ... | -0.028023 | -0.167523 | -0.142474 | -0.133285 | -0.124610 | 0.004807 | 0.001543 | 0.001162 | -0.002736 | 0.003255 |
| Height | -0.034734 | -0.090199 | 0.082604 | 0.038546 | -0.009807 | -0.382862 | 0.034881 | -0.174793 | -0.422753 | -0.039469 | ... | -0.340664 | -0.135785 | -0.073733 | -0.058877 | -0.066869 | 0.360594 | 0.360796 | 0.358780 | 0.361916 | 0.362636 |
| Weight | -0.150844 | -0.191425 | 0.230213 | 0.154634 | -0.006947 | -0.267830 | 0.088340 | -0.130724 | -0.351209 | -0.087319 | ... | -0.253387 | -0.034444 | -0.049356 | -0.046835 | -0.056164 | 0.340034 | 0.339024 | 0.337717 | 0.342178 | 0.341135 |
| Crossing | -0.390062 | -0.131994 | 0.130545 | 0.394972 | 0.246319 | 0.866417 | 0.191770 | 0.307925 | 0.741035 | -0.076585 | ... | 0.645805 | 0.575446 | 0.443101 | 0.428963 | 0.409961 | -0.663053 | -0.660193 | -0.659767 | -0.660160 | -0.662539 |
| Finishing | -0.325529 | -0.082323 | 0.068660 | 0.332515 | 0.243355 | 0.724244 | 0.178373 | 0.357416 | 0.743439 | -0.006639 | ... | 0.837827 | 0.533414 | 0.024218 | -0.033023 | -0.071811 | -0.588752 | -0.587145 | -0.583268 | -0.584852 | -0.586913 |
| HeadingAccuracy | -0.337766 | -0.106815 | 0.147183 | 0.340776 | 0.200988 | 0.644421 | 0.157483 | 0.183238 | 0.443005 | -0.091688 | ... | 0.551978 | 0.507208 | 0.583123 | 0.561063 | 0.533643 | -0.750417 | -0.749888 | -0.746444 | -0.744443 | -0.748895 |
| ShortPassing | -0.492495 | -0.136279 | 0.132894 | 0.502550 | 0.369189 | 0.906729 | 0.242803 | 0.322133 | 0.730363 | -0.100241 | ... | 0.676063 | 0.685137 | 0.559576 | 0.541131 | 0.508644 | -0.729785 | -0.728024 | -0.724381 | -0.723782 | -0.728721 |
| Volleys | -0.384285 | -0.159915 | 0.142472 | 0.391338 | 0.254906 | 0.773974 | 0.243089 | 0.357340 | 0.745077 | -0.026731 | ... | 0.829257 | 0.595281 | 0.120919 | 0.072788 | 0.035457 | -0.590808 | -0.588668 | -0.584954 | -0.586131 | -0.588670 |
| Dribbling | -0.364106 | -0.030340 | 0.010166 | 0.372426 | 0.315019 | 0.874274 | 0.179041 | 0.352658 | 0.839757 | -0.028021 | ... | 0.769594 | 0.597498 | 0.336072 | 0.301251 | 0.273963 | -0.754625 | -0.753181 | -0.749816 | -0.751348 | -0.754341 |
| Curve | -0.416378 | -0.169511 | 0.143276 | 0.419491 | 0.279944 | 0.851900 | 0.233681 | 0.345468 | 0.771052 | -0.055428 | ... | 0.751833 | 0.616532 | 0.289529 | 0.261481 | 0.232869 | -0.606286 | -0.603141 | -0.600266 | -0.603540 | -0.604960 |
| FKAccuracy | -0.395433 | -0.199549 | 0.193467 | 0.396892 | 0.230544 | 0.806414 | 0.223564 | 0.330472 | 0.701068 | -0.068843 | ... | 0.734440 | 0.585120 | 0.297976 | 0.279153 | 0.247903 | -0.556605 | -0.553644 | -0.549911 | -0.552641 | -0.554920 |
| LongPassing | -0.477500 | -0.186764 | 0.181310 | 0.483909 | 0.321437 | 0.846302 | 0.239525 | 0.277174 | 0.622342 | -0.117424 | ... | 0.542247 | 0.645797 | 0.587106 | 0.587430 | 0.562230 | -0.596820 | -0.594999 | -0.591453 | -0.591561 | -0.595887 |
| BallControl | -0.449655 | -0.100184 | 0.084969 | 0.460197 | 0.354396 | 0.912107 | 0.217946 | 0.356383 | 0.818051 | -0.073210 | ... | 0.769791 | 0.674881 | 0.452705 | 0.417566 | 0.384802 | -0.788444 | -0.786797 | -0.783423 | -0.783607 | -0.787939 |
| Acceleration | -0.185030 | 0.133236 | -0.158667 | 0.196869 | 0.234608 | 0.654337 | 0.044319 | 0.261435 | 0.652356 | -0.004395 | ... | 0.532908 | 0.347427 | 0.195369 | 0.163000 | 0.157565 | -0.593008 | -0.594866 | -0.592127 | -0.592143 | -0.593201 |
| SprintSpeed | -0.198797 | 0.132437 | -0.151682 | 0.210647 | 0.236771 | 0.645963 | 0.044070 | 0.248822 | 0.624098 | -0.015069 | ... | 0.521071 | 0.351607 | 0.212575 | 0.178214 | 0.171980 | -0.597677 | -0.599694 | -0.597320 | -0.596498 | -0.597837 |
| Agility | -0.256270 | -0.019897 | -0.019395 | 0.264952 | 0.222310 | 0.699673 | 0.100869 | 0.302062 | 0.681765 | -0.034158 | ... | 0.566175 | 0.432511 | 0.167122 | 0.129204 | 0.116686 | -0.527756 | -0.528482 | -0.527164 | -0.526983 | -0.528899 |
| Reactions | -0.832156 | -0.408617 | 0.453124 | 0.850045 | 0.513425 | 0.597169 | 0.445614 | 0.201341 | 0.377044 | -0.192622 | ... | 0.346143 | 0.685558 | 0.283607 | 0.255399 | 0.228355 | -0.062967 | -0.061940 | -0.065927 | -0.055031 | -0.059961 |
| Balance | -0.097160 | 0.048463 | -0.089877 | 0.103160 | 0.138025 | 0.586788 | 0.050076 | 0.254022 | 0.578459 | 0.008009 | ... | 0.482794 | 0.310763 | 0.178695 | 0.154045 | 0.152470 | -0.504727 | -0.506102 | -0.503970 | -0.503652 | -0.505974 |
| ShotPower | -0.440023 | -0.166133 | 0.156947 | 0.441118 | 0.288318 | 0.835277 | 0.227772 | 0.332855 | 0.718237 | -0.053860 | ... | 0.795220 | 0.634495 | 0.296944 | 0.256403 | 0.220237 | -0.654117 | -0.654099 | -0.649403 | -0.651409 | -0.653475 |
| Jumping | -0.261581 | -0.169369 | 0.177167 | 0.264435 | 0.109151 | 0.321846 | 0.120931 | 0.069752 | 0.107553 | -0.104179 | ... | 0.133294 | 0.252353 | 0.279196 | 0.260645 | 0.260261 | -0.192700 | -0.193692 | -0.195282 | -0.189079 | -0.192050 |
| Stamina | -0.358451 | -0.053895 | 0.097793 | 0.365656 | 0.202563 | 0.792762 | 0.094780 | 0.232094 | 0.570226 | -0.127822 | ... | 0.516426 | 0.523112 | 0.587782 | 0.570055 | 0.544702 | -0.701467 | -0.698556 | -0.696729 | -0.696073 | -0.699670 |
| Strength | -0.342839 | -0.259756 | 0.332798 | 0.349326 | 0.075769 | 0.192990 | 0.131280 | -0.008470 | -0.041475 | -0.158411 | ... | 0.054491 | 0.280522 | 0.333334 | 0.332159 | 0.304849 | -0.111012 | -0.109660 | -0.110253 | -0.103878 | -0.107497 |
| LongShots | -0.417853 | -0.161549 | 0.155096 | 0.420795 | 0.266740 | 0.840049 | 0.213960 | 0.355967 | 0.752980 | -0.046174 | ... | 0.812446 | 0.616102 | 0.215510 | 0.172331 | 0.133603 | -0.612381 | -0.610739 | -0.605952 | -0.607200 | -0.610087 |
| Aggression | -0.397067 | -0.228329 | 0.265190 | 0.395470 | 0.171174 | 0.666236 | 0.173327 | 0.131524 | 0.347795 | -0.146907 | ... | 0.336089 | 0.515776 | 0.723961 | 0.744216 | 0.721384 | -0.575843 | -0.576114 | -0.573607 | -0.571201 | -0.575142 |
| Interceptions | -0.319162 | -0.160602 | 0.197845 | 0.321326 | 0.154908 | 0.561676 | 0.129586 | 0.053097 | 0.209604 | -0.158526 | ... | 0.110834 | 0.397450 | 0.888349 | 0.941471 | 0.928282 | -0.485585 | -0.486324 | -0.485394 | -0.481279 | -0.486036 |
| Positioning | -0.351820 | -0.088330 | 0.082443 | 0.356493 | 0.245616 | 0.824307 | 0.183003 | 0.346896 | 0.781248 | -0.025422 | ... | 0.801268 | 0.580498 | 0.202597 | 0.158060 | 0.124228 | -0.679480 | -0.677699 | -0.674393 | -0.675569 | -0.678582 |
| Vision | -0.490296 | -0.215170 | 0.187422 | 0.498894 | 0.348141 | 0.761992 | 0.284600 | 0.337897 | 0.674057 | -0.078050 | ... | 0.632927 | 0.636280 | 0.176760 | 0.146460 | 0.113228 | -0.381899 | -0.377807 | -0.374737 | -0.375775 | -0.381158 |
| Penalties | -0.337899 | -0.140657 | 0.139535 | 0.341429 | 0.224281 | 0.734533 | 0.218620 | 0.330252 | 0.690434 | -0.028023 | ... | 1.000000 | 0.551801 | 0.152296 | 0.101920 | 0.066693 | -0.620069 | -0.618968 | -0.614006 | -0.617074 | -0.619099 |
| Composure | -0.716173 | -0.384473 | 0.391023 | 0.727655 | 0.440008 | 0.752331 | 0.392787 | 0.278132 | 0.586836 | -0.167523 | ... | 0.551801 | 1.000000 | 0.384081 | 0.351726 | 0.317492 | -0.378750 | -0.375720 | -0.374897 | -0.370234 | -0.377626 |
| Marking | -0.280829 | -0.110198 | 0.142817 | 0.286505 | 0.162801 | 0.561866 | 0.115208 | 0.065673 | 0.241428 | -0.142474 | ... | 0.152296 | 0.384081 | 1.000000 | 0.906541 | 0.895908 | -0.550978 | -0.552263 | -0.549498 | -0.546670 | -0.551290 |
| StandingTackle | -0.248564 | -0.085929 | 0.119745 | 0.252629 | 0.143564 | 0.538802 | 0.092846 | 0.042646 | 0.210517 | -0.133285 | ... | 0.101920 | 0.351726 | 0.906541 | 1.000000 | 0.974659 | -0.530989 | -0.532160 | -0.531092 | -0.527792 | -0.531474 |
| SlidingTackle | -0.220034 | -0.068409 | 0.103089 | 0.222811 | 0.128980 | 0.506968 | 0.079176 | 0.026105 | 0.178607 | -0.124610 | ... | 0.066693 | 0.317492 | 0.895908 | 0.974659 | 1.000000 | -0.509337 | -0.510591 | -0.509378 | -0.505792 | -0.509425 |
| GKDiving | 0.027757 | -0.105594 | 0.101277 | -0.025937 | -0.053446 | -0.674637 | 0.004526 | -0.231905 | -0.621675 | 0.004807 | ... | -0.620069 | -0.378750 | -0.550978 | -0.530989 | -0.509337 | 1.000000 | 0.970280 | 0.965685 | 0.969864 | 0.973320 |
| GKHandling | 0.027026 | -0.111149 | 0.106419 | -0.025062 | -0.054672 | -0.673625 | 0.003942 | -0.233098 | -0.619755 | 0.001543 | ... | -0.618968 | -0.375720 | -0.552263 | -0.532160 | -0.510591 | 0.970280 | 1.000000 | 0.965239 | 0.969408 | 0.970264 |
| GKKicking | 0.030654 | -0.106652 | 0.104964 | -0.029372 | -0.059061 | -0.670254 | 0.000651 | -0.229395 | -0.616990 | 0.001162 | ... | -0.614006 | -0.374897 | -0.549498 | -0.531092 | -0.509378 | 0.965685 | 0.965239 | 1.000000 | 0.964336 | 0.966337 |
| GKPositioning | 0.019842 | -0.118250 | 0.116402 | -0.017674 | -0.052589 | -0.668272 | 0.006904 | -0.231298 | -0.618853 | -0.002736 | ... | -0.617074 | -0.370234 | -0.546670 | -0.527792 | -0.505792 | 0.969864 | 0.969408 | 0.964336 | 1.000000 | 0.970130 |
| GKReflexes | 0.025610 | -0.105778 | 0.103313 | -0.023276 | -0.053341 | -0.673238 | 0.003444 | -0.232574 | -0.621925 | 0.003255 | ... | -0.619099 | -0.377626 | -0.551290 | -0.531474 | -0.509425 | 0.973320 | 0.970264 | 0.966337 | 0.970130 | 1.000000 |
46 rows × 46 columns
# correaltions between Potential and Overall columns
c = np.corrcoef(df['Potential'],df['Overall'])
print('Correlations between Potential and Overall\n',c)
Correlations between Potential and Overall [[1. 0.66118012] [0.66118012 1. ]]
plt.figure(figsize=(10,8), dpi =500)
sns.heatmap(corr,annot=True,fmt=".2f", linewidth=.5)
plt.show()
# correaltions between Potential and Overall columns
c = np.corrcoef(df['Age'],df['Overall'])
print('Correlations between Age and Overall\n',c)
Correlations between Age and Overall [[1. 0.45306932] [0.45306932 1. ]]
#bestfit line
sns.lmplot(x="Age", y="Overall", data=df);
#scatterplot
import seaborn as sns
sns.scatterplot(x="Age", y="Overall", data=df);
sns.lmplot(x="Age", y="Overall", hue="Potential", data=df);
#Corrleation matrix
cormat = df.corr()
round(cormat,2)
| Unnamed: 0 | ID | Age | Overall | Potential | Special | International Reputation | Weak Foot | Skill Moves | Jersey Number | ... | Penalties | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 1.00 | 0.42 | -0.46 | -0.97 | -0.63 | -0.60 | -0.41 | -0.20 | -0.42 | 0.21 | ... | -0.34 | -0.72 | -0.28 | -0.25 | -0.22 | 0.03 | 0.03 | 0.03 | 0.02 | 0.03 |
| ID | 0.42 | 1.00 | -0.74 | -0.42 | 0.05 | -0.23 | -0.36 | -0.08 | -0.06 | 0.18 | ... | -0.14 | -0.38 | -0.11 | -0.09 | -0.07 | -0.11 | -0.11 | -0.11 | -0.12 | -0.11 |
| Age | -0.46 | -0.74 | 1.00 | 0.45 | -0.25 | 0.24 | 0.25 | 0.06 | 0.03 | -0.24 | ... | 0.14 | 0.39 | 0.14 | 0.12 | 0.10 | 0.10 | 0.11 | 0.10 | 0.12 | 0.10 |
| Overall | -0.97 | -0.42 | 0.45 | 1.00 | 0.66 | 0.61 | 0.50 | 0.21 | 0.41 | -0.22 | ... | 0.34 | 0.73 | 0.29 | 0.25 | 0.22 | -0.03 | -0.03 | -0.03 | -0.02 | -0.02 |
| Potential | -0.63 | 0.05 | -0.25 | 0.66 | 1.00 | 0.38 | 0.37 | 0.16 | 0.35 | -0.01 | ... | 0.22 | 0.44 | 0.16 | 0.14 | 0.13 | -0.05 | -0.05 | -0.06 | -0.05 | -0.05 |
| Special | -0.60 | -0.23 | 0.24 | 0.61 | 0.38 | 1.00 | 0.29 | 0.34 | 0.76 | -0.13 | ... | 0.73 | 0.75 | 0.56 | 0.54 | 0.51 | -0.67 | -0.67 | -0.67 | -0.67 | -0.67 |
| International Reputation | -0.41 | -0.36 | 0.25 | 0.50 | 0.37 | 0.29 | 1.00 | 0.13 | 0.21 | -0.08 | ... | 0.22 | 0.39 | 0.12 | 0.09 | 0.08 | 0.00 | 0.00 | 0.00 | 0.01 | 0.00 |
| Weak Foot | -0.20 | -0.08 | 0.06 | 0.21 | 0.16 | 0.34 | 0.13 | 1.00 | 0.34 | -0.04 | ... | 0.33 | 0.28 | 0.07 | 0.04 | 0.03 | -0.23 | -0.23 | -0.23 | -0.23 | -0.23 |
| Skill Moves | -0.42 | -0.06 | 0.03 | 0.41 | 0.35 | 0.76 | 0.21 | 0.34 | 1.00 | -0.04 | ... | 0.69 | 0.59 | 0.24 | 0.21 | 0.18 | -0.62 | -0.62 | -0.62 | -0.62 | -0.62 |
| Jersey Number | 0.21 | 0.18 | -0.24 | -0.22 | -0.01 | -0.13 | -0.08 | -0.04 | -0.04 | 1.00 | ... | -0.03 | -0.17 | -0.14 | -0.13 | -0.12 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 |
| Height | -0.03 | -0.09 | 0.08 | 0.04 | -0.01 | -0.38 | 0.03 | -0.17 | -0.42 | -0.04 | ... | -0.34 | -0.14 | -0.07 | -0.06 | -0.07 | 0.36 | 0.36 | 0.36 | 0.36 | 0.36 |
| Weight | -0.15 | -0.19 | 0.23 | 0.15 | -0.01 | -0.27 | 0.09 | -0.13 | -0.35 | -0.09 | ... | -0.25 | -0.03 | -0.05 | -0.05 | -0.06 | 0.34 | 0.34 | 0.34 | 0.34 | 0.34 |
| Crossing | -0.39 | -0.13 | 0.13 | 0.39 | 0.25 | 0.87 | 0.19 | 0.31 | 0.74 | -0.08 | ... | 0.65 | 0.58 | 0.44 | 0.43 | 0.41 | -0.66 | -0.66 | -0.66 | -0.66 | -0.66 |
| Finishing | -0.33 | -0.08 | 0.07 | 0.33 | 0.24 | 0.72 | 0.18 | 0.36 | 0.74 | -0.01 | ... | 0.84 | 0.53 | 0.02 | -0.03 | -0.07 | -0.59 | -0.59 | -0.58 | -0.58 | -0.59 |
| HeadingAccuracy | -0.34 | -0.11 | 0.15 | 0.34 | 0.20 | 0.64 | 0.16 | 0.18 | 0.44 | -0.09 | ... | 0.55 | 0.51 | 0.58 | 0.56 | 0.53 | -0.75 | -0.75 | -0.75 | -0.74 | -0.75 |
| ShortPassing | -0.49 | -0.14 | 0.13 | 0.50 | 0.37 | 0.91 | 0.24 | 0.32 | 0.73 | -0.10 | ... | 0.68 | 0.69 | 0.56 | 0.54 | 0.51 | -0.73 | -0.73 | -0.72 | -0.72 | -0.73 |
| Volleys | -0.38 | -0.16 | 0.14 | 0.39 | 0.25 | 0.77 | 0.24 | 0.36 | 0.75 | -0.03 | ... | 0.83 | 0.60 | 0.12 | 0.07 | 0.04 | -0.59 | -0.59 | -0.58 | -0.59 | -0.59 |
| Dribbling | -0.36 | -0.03 | 0.01 | 0.37 | 0.32 | 0.87 | 0.18 | 0.35 | 0.84 | -0.03 | ... | 0.77 | 0.60 | 0.34 | 0.30 | 0.27 | -0.75 | -0.75 | -0.75 | -0.75 | -0.75 |
| Curve | -0.42 | -0.17 | 0.14 | 0.42 | 0.28 | 0.85 | 0.23 | 0.35 | 0.77 | -0.06 | ... | 0.75 | 0.62 | 0.29 | 0.26 | 0.23 | -0.61 | -0.60 | -0.60 | -0.60 | -0.60 |
| FKAccuracy | -0.40 | -0.20 | 0.19 | 0.40 | 0.23 | 0.81 | 0.22 | 0.33 | 0.70 | -0.07 | ... | 0.73 | 0.59 | 0.30 | 0.28 | 0.25 | -0.56 | -0.55 | -0.55 | -0.55 | -0.55 |
| LongPassing | -0.48 | -0.19 | 0.18 | 0.48 | 0.32 | 0.85 | 0.24 | 0.28 | 0.62 | -0.12 | ... | 0.54 | 0.65 | 0.59 | 0.59 | 0.56 | -0.60 | -0.59 | -0.59 | -0.59 | -0.60 |
| BallControl | -0.45 | -0.10 | 0.08 | 0.46 | 0.35 | 0.91 | 0.22 | 0.36 | 0.82 | -0.07 | ... | 0.77 | 0.67 | 0.45 | 0.42 | 0.38 | -0.79 | -0.79 | -0.78 | -0.78 | -0.79 |
| Acceleration | -0.19 | 0.13 | -0.16 | 0.20 | 0.23 | 0.65 | 0.04 | 0.26 | 0.65 | -0.00 | ... | 0.53 | 0.35 | 0.20 | 0.16 | 0.16 | -0.59 | -0.59 | -0.59 | -0.59 | -0.59 |
| SprintSpeed | -0.20 | 0.13 | -0.15 | 0.21 | 0.24 | 0.65 | 0.04 | 0.25 | 0.62 | -0.02 | ... | 0.52 | 0.35 | 0.21 | 0.18 | 0.17 | -0.60 | -0.60 | -0.60 | -0.60 | -0.60 |
| Agility | -0.26 | -0.02 | -0.02 | 0.26 | 0.22 | 0.70 | 0.10 | 0.30 | 0.68 | -0.03 | ... | 0.57 | 0.43 | 0.17 | 0.13 | 0.12 | -0.53 | -0.53 | -0.53 | -0.53 | -0.53 |
| Reactions | -0.83 | -0.41 | 0.45 | 0.85 | 0.51 | 0.60 | 0.45 | 0.20 | 0.38 | -0.19 | ... | 0.35 | 0.69 | 0.28 | 0.26 | 0.23 | -0.06 | -0.06 | -0.07 | -0.06 | -0.06 |
| Balance | -0.10 | 0.05 | -0.09 | 0.10 | 0.14 | 0.59 | 0.05 | 0.25 | 0.58 | 0.01 | ... | 0.48 | 0.31 | 0.18 | 0.15 | 0.15 | -0.50 | -0.51 | -0.50 | -0.50 | -0.51 |
| ShotPower | -0.44 | -0.17 | 0.16 | 0.44 | 0.29 | 0.84 | 0.23 | 0.33 | 0.72 | -0.05 | ... | 0.80 | 0.63 | 0.30 | 0.26 | 0.22 | -0.65 | -0.65 | -0.65 | -0.65 | -0.65 |
| Jumping | -0.26 | -0.17 | 0.18 | 0.26 | 0.11 | 0.32 | 0.12 | 0.07 | 0.11 | -0.10 | ... | 0.13 | 0.25 | 0.28 | 0.26 | 0.26 | -0.19 | -0.19 | -0.20 | -0.19 | -0.19 |
| Stamina | -0.36 | -0.05 | 0.10 | 0.37 | 0.20 | 0.79 | 0.09 | 0.23 | 0.57 | -0.13 | ... | 0.52 | 0.52 | 0.59 | 0.57 | 0.54 | -0.70 | -0.70 | -0.70 | -0.70 | -0.70 |
| Strength | -0.34 | -0.26 | 0.33 | 0.35 | 0.08 | 0.19 | 0.13 | -0.01 | -0.04 | -0.16 | ... | 0.05 | 0.28 | 0.33 | 0.33 | 0.30 | -0.11 | -0.11 | -0.11 | -0.10 | -0.11 |
| LongShots | -0.42 | -0.16 | 0.16 | 0.42 | 0.27 | 0.84 | 0.21 | 0.36 | 0.75 | -0.05 | ... | 0.81 | 0.62 | 0.22 | 0.17 | 0.13 | -0.61 | -0.61 | -0.61 | -0.61 | -0.61 |
| Aggression | -0.40 | -0.23 | 0.27 | 0.40 | 0.17 | 0.67 | 0.17 | 0.13 | 0.35 | -0.15 | ... | 0.34 | 0.52 | 0.72 | 0.74 | 0.72 | -0.58 | -0.58 | -0.57 | -0.57 | -0.58 |
| Interceptions | -0.32 | -0.16 | 0.20 | 0.32 | 0.15 | 0.56 | 0.13 | 0.05 | 0.21 | -0.16 | ... | 0.11 | 0.40 | 0.89 | 0.94 | 0.93 | -0.49 | -0.49 | -0.49 | -0.48 | -0.49 |
| Positioning | -0.35 | -0.09 | 0.08 | 0.36 | 0.25 | 0.82 | 0.18 | 0.35 | 0.78 | -0.03 | ... | 0.80 | 0.58 | 0.20 | 0.16 | 0.12 | -0.68 | -0.68 | -0.67 | -0.68 | -0.68 |
| Vision | -0.49 | -0.22 | 0.19 | 0.50 | 0.35 | 0.76 | 0.28 | 0.34 | 0.67 | -0.08 | ... | 0.63 | 0.64 | 0.18 | 0.15 | 0.11 | -0.38 | -0.38 | -0.37 | -0.38 | -0.38 |
| Penalties | -0.34 | -0.14 | 0.14 | 0.34 | 0.22 | 0.73 | 0.22 | 0.33 | 0.69 | -0.03 | ... | 1.00 | 0.55 | 0.15 | 0.10 | 0.07 | -0.62 | -0.62 | -0.61 | -0.62 | -0.62 |
| Composure | -0.72 | -0.38 | 0.39 | 0.73 | 0.44 | 0.75 | 0.39 | 0.28 | 0.59 | -0.17 | ... | 0.55 | 1.00 | 0.38 | 0.35 | 0.32 | -0.38 | -0.38 | -0.37 | -0.37 | -0.38 |
| Marking | -0.28 | -0.11 | 0.14 | 0.29 | 0.16 | 0.56 | 0.12 | 0.07 | 0.24 | -0.14 | ... | 0.15 | 0.38 | 1.00 | 0.91 | 0.90 | -0.55 | -0.55 | -0.55 | -0.55 | -0.55 |
| StandingTackle | -0.25 | -0.09 | 0.12 | 0.25 | 0.14 | 0.54 | 0.09 | 0.04 | 0.21 | -0.13 | ... | 0.10 | 0.35 | 0.91 | 1.00 | 0.97 | -0.53 | -0.53 | -0.53 | -0.53 | -0.53 |
| SlidingTackle | -0.22 | -0.07 | 0.10 | 0.22 | 0.13 | 0.51 | 0.08 | 0.03 | 0.18 | -0.12 | ... | 0.07 | 0.32 | 0.90 | 0.97 | 1.00 | -0.51 | -0.51 | -0.51 | -0.51 | -0.51 |
| GKDiving | 0.03 | -0.11 | 0.10 | -0.03 | -0.05 | -0.67 | 0.00 | -0.23 | -0.62 | 0.00 | ... | -0.62 | -0.38 | -0.55 | -0.53 | -0.51 | 1.00 | 0.97 | 0.97 | 0.97 | 0.97 |
| GKHandling | 0.03 | -0.11 | 0.11 | -0.03 | -0.05 | -0.67 | 0.00 | -0.23 | -0.62 | 0.00 | ... | -0.62 | -0.38 | -0.55 | -0.53 | -0.51 | 0.97 | 1.00 | 0.97 | 0.97 | 0.97 |
| GKKicking | 0.03 | -0.11 | 0.10 | -0.03 | -0.06 | -0.67 | 0.00 | -0.23 | -0.62 | 0.00 | ... | -0.61 | -0.37 | -0.55 | -0.53 | -0.51 | 0.97 | 0.97 | 1.00 | 0.96 | 0.97 |
| GKPositioning | 0.02 | -0.12 | 0.12 | -0.02 | -0.05 | -0.67 | 0.01 | -0.23 | -0.62 | -0.00 | ... | -0.62 | -0.37 | -0.55 | -0.53 | -0.51 | 0.97 | 0.97 | 0.96 | 1.00 | 0.97 |
| GKReflexes | 0.03 | -0.11 | 0.10 | -0.02 | -0.05 | -0.67 | 0.00 | -0.23 | -0.62 | 0.00 | ... | -0.62 | -0.38 | -0.55 | -0.53 | -0.51 | 0.97 | 0.97 | 0.97 | 0.97 | 1.00 |
46 rows × 46 columns
#Correlation matrix to heat map
sns.heatmap(cormat);
import pingouin as pg
pg.corr(x=df['Potential'], y=df['Overall'])
--------------------------------------------------------------------------- FloatingPointError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_4616\4157861621.py in ?() 1 import pingouin as pg ----> 2 pg.corr(x=df['Potential'], y=df['Overall']) ~\anaconda3\Lib\site-packages\pingouin\correlation.py in ?(x, y, alternative, method, **kwargs) 600 n = x.size 601 602 # Compute correlation coefficient and two-sided p-value 603 if method == "pearson": --> 604 r, pval = pearsonr(x, y) 605 elif method == "spearman": 606 r, pval = spearmanr(x, y, **kwargs) 607 elif method == "kendall": ~\anaconda3\Lib\site-packages\scipy\stats\_stats_py.py in ?(x, y, alternative, method) 4856 # hypothesis is the beta distribution on (-1, 1) with a = b = n/2 - 1. 4857 ab = n/2 - 1 4858 dist = stats.beta(ab, ab, loc=-1, scale=2) 4859 if alternative == 'two-sided': -> 4860 prob = 2*dist.sf(abs(r)) 4861 elif alternative == 'less': 4862 prob = dist.cdf(r) 4863 elif alternative == 'greater': ~\anaconda3\Lib\site-packages\scipy\stats\_distn_infrastructure.py in ?(self, x) 493 def sf(self, x): --> 494 return self.dist.sf(x, *self.args, **self.kwds) ~\anaconda3\Lib\site-packages\scipy\stats\_distn_infrastructure.py in ?(self, x, *args, **kwds) 2155 place(output, (1-cond0)+np.isnan(x), self.badvalue) 2156 place(output, cond2, 1.0) 2157 if np.any(cond): 2158 goodargs = argsreduce(cond, *((x,)+args)) -> 2159 place(output, cond, self._sf(*goodargs)) 2160 if output.ndim == 0: 2161 return output[()] 2162 return output ~\anaconda3\Lib\site-packages\scipy\stats\_continuous_distns.py in ?(self, x, a, b) 692 def _sf(self, x, a, b): --> 693 return _boost._beta_sf(x, a, b) FloatingPointError: underflow encountered in _beta_sf